@dragonfive
2016-03-16T03:24:29.000000Z
Python programming
```python
#encoding:UTF-8
import urllib2

url = "http://www.baidu.com"
data = urllib2.urlopen(url).read()
#data = data.decode('UTF-8')
print(data)
```
```python
import re

def getImg(html):
    reg = r'src="(.+?\.jpg)" pic_ext'  # capture the .jpg URL inside the parentheses
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    return imglist
```
1. `re.findall` returns only what the capture group `()` matched, not the whole pattern.
2. `.+?` matches a string of any length non-greedily: it stops at the first occurrence of whatever follows it.
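A minimal sketch illustrating both points (the HTML snippet here is invented for illustration):

```python
import re

# Invented sample HTML for illustration only.
html = ('<img src="http://example.com/a.jpg" pic_ext="jpeg">'
        '<img src="http://example.com/b.jpg" pic_ext="jpeg">')

# findall returns only what the () group captured: the two image URLs.
print(re.findall(r'src="(.+?\.jpg)" pic_ext', html))
# ['http://example.com/a.jpg', 'http://example.com/b.jpg']

# With a greedy .+ instead, the single match swallows everything
# up to the LAST occurrence of '.jpg" pic_ext':
print(re.findall(r'src="(.+\.jpg)" pic_ext', html))
# ['http://example.com/a.jpg" pic_ext="jpeg"><img src="http://example.com/b.jpg']
```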
```http
GET / HTTP/1.1
Accept: text/html, application/xhtml+xml, image/jxr, */*
Accept-Language: zh-Hans-CN,zh-Hans;q=0.5
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko
Accept-Encoding: gzip, deflate
DNT: 1
Host: www.baidu.com
Connection: Keep-Alive
Cookie: BAIDUID=2C7FF7888B4CED50341979D818CCE4D4:FG=1; BIDUPSID=2C7FF7888B4CED50341979D818CCE4D4; PSTM=1457514949
```
When writing a crawler in Python, you have to make your requests masquerade as a real browser. The packet-capture tool Fiddler shows exactly which headers a browser sends; the GET request above was captured this way.
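A minimal sketch of disguising a request with urllib2, reusing header values from the capture above (the full script below takes the same approach):

```python
#encoding:UTF-8
import urllib2

url = "http://www.baidu.com"
# Pretend to be a browser by sending the headers Fiddler showed us.
# Accept-Encoding is omitted so the response comes back uncompressed.
req = urllib2.Request(url, headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Accept': 'text/html, application/xhtml+xml, image/jxr, */*',
    'Accept-Language': 'zh-Hans-CN,zh-Hans;q=0.5',
    'Connection': 'Keep-Alive',
})
html = urllib2.urlopen(req, timeout=30).read()
print(len(html))
```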
First, download the details of a batch of ships from the site and save them as a list of records. Then, for each ship's MMSI, download a number of photos of that ship.
On images: the earlier ships were downloaded at a width of 300; later downloads were changed to 800.
On the CSV files: the first 50 were saved ten records per file; it later turned out the site's index supports at most 50 entries per page.
```python
#coding=utf-8
import re
import os
import urllib2
import numpy as np

# Fetch a page, pretending to be Chrome; return None on any failure.
def getHtml(url):
    req = urllib2.Request(url, headers={
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8'})
    try:
        oper = urllib2.urlopen(req, timeout=300)
        return oper.read()
    except:
        return None

# Download every photo found in an html page into the directory preDir.
def getImg(html, preDir):
    reg = r"data-original='(.+?)0' data-title"
    imgre = re.compile(reg)
    imglist = re.findall(imgre, html)
    x = 0
    for imgurl in imglist:
        req = urllib2.Request(imgurl, headers={
            'Connection': 'keep-alive',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
            'Accept-Language': 'zh-CN,zh;q=0.8'})
        try:
            img = urllib2.urlopen(req, timeout=300).read()
            f_obj = open(preDir + '%s.jpg' % x, 'wb')  # 'wb': write in binary mode
            f_obj.write(img)
            f_obj.close()
            x += 1
        except:
            print 'download img error %s' % x

# Extract (name, shipid, imo, mmsi, vessel) tuples from a listing page, e.g.:
# http://www.marinetraffic.com/en/ais/index/ships/all/sort:COUNT_PHOTOS/direction:desc/per_page:50/page:1
def getDetail(html):
    reg = r'Show Details For: (.+?)" href="/en/ais/details/ships/shipid:([0-9]+?)/imo:([0-9]+?)/mmsi:([0-9]+?)/vessel:(.+?)">'
    imgre = re.compile(reg)
    boatDetail = re.findall(imgre, html)
    return boatDetail

# Save the scraped tuples to a csv file.
def save2csv(boatDetail, filename):
    boatDetail = np.array(boatDetail)
    np.savetxt(filename, boatDetail, fmt=['%s'] * boatDetail.shape[1], newline='\n', delimiter=',')

# Read a csv file back: shipId indexes the photo url, MMSI names the directory.
def readFcsv(filename):
    shipIds, mmsis = np.loadtxt(filename, dtype='int', delimiter=',', usecols=(1, 3), unpack=True)
    return shipIds, mmsis

allboatsUrl = 'http://www.marinetraffic.com/en/ais/index/ships/all/sort:COUNT_PHOTOS/direction:desc/per_page:50/page:'
for pageInx in range(11, 50):
    html = getHtml(allboatsUrl + str(pageInx))
    if html != None:
        csvNum = pageInx + 40
        save2csv(getDetail(html), './boatDetail/%s.csv' % csvNum)
        print 'start downloading ship images on page %s' % pageInx
    else:
        print 'failed to download ship info on page %s' % pageInx
        continue
    shipids, mmsis = readFcsv('./boatDetail/%s.csv' % csvNum)
    shipIndex = 0
    for ship in shipids:
        curDir = str(mmsis[shipIndex])
        preDir = './' + curDir + '/'
        imgUrl = 'http://www.marinetraffic.com/en/photos/of/ships/shipid:' + str(ship) + '/per_page:50'
        imgHtml = getHtml(imgUrl)
        if imgHtml != None:
            os.mkdir(curDir)
            getImg(imgHtml, preDir)
            print 'finished downloading data for ship with MMSI ' + curDir
        else:
            print 'failed to download data for ship with MMSI ' + curDir
        shipIndex += 1
```
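As an aside for Python 3 users (the script above is Python 2, where urllib2 no longer exists): a rough sketch of the same getHtml helper using urllib.request, assuming only the import paths need to change:

```python
import urllib.request

# Python 3 sketch of getHtml: same headers, same return-None-on-failure contract.
def getHtml(url):
    req = urllib.request.Request(url, headers={
        'Connection': 'keep-alive',
        'Accept': '*/*',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
        'Accept-Language': 'zh-CN,zh;q=0.8'})
    try:
        with urllib.request.urlopen(req, timeout=300) as oper:
            return oper.read()
    except Exception:
        return None
```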
Further reading:
- Python 3 crawler - Jecvay Notes
- Implementing a simple crawler in Python
- How to get started with Python crawlers? - Zhihu
- How did you start writing Python crawlers? - Zhihu
- How exactly should the parentheses in Python regular expressions be used? - Zhihu