[关闭]
@dragonfive 2016-03-16T03:24:29.000000Z 字数 3568 阅读 571

Python写爬虫

python编程


用Python抓取指定页面:

  1. #encoding:UTF-8
  2. import urllib
  3. import urllib2
  4. url = "http://www.baidu.com"
  5. data = urllib2.urlopen(url).read()
  6. #data = data.decode('UTF-8')
  7. print(data)

对网页内容使用正则表达式进行匹配

  1. def getImg(html):
  2. reg = r'src="(.+?\.jpg)" pic_ext'
  3. imgre = re.compile(reg)
  4. imglist = re.findall(imgre,html)
  5. return imglist

1 findall 取的是正则里圆括号捕获组 () 中匹配到的内容
2 .+? 表示非贪婪地匹配一个或多个任意字符(即尽可能短地匹配)

GET / HTTP/1.1
Accept: text/html, application/xhtml+xml, image/jxr, /
Accept-Language: zh-Hans-CN,zh-Hans;q=0.5
User-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko
Accept-Encoding: gzip, deflate
DNT: 1
Host: www.baidu.com
Connection: Keep-Alive
Cookie: BAIDUID=2C7FF7888B4CED50341979D818CCE4D4:FG=1; BIDUPSID=2C7FF7888B4CED50341979D818CCE4D4; PSTM=1457514949

下载 MarineTraffic(marinetraffic.com)上面的船只图像

用python写爬虫,需要注意假冒成浏览器的样子

冒充浏览器需要写入header

用到抓包程序fiddler

程序逻辑

首先下载网站若干船只的所有信息到一个对象列表里面保存下来
然后对每一个船只,按其 MMSI 下载若干张船只图片

发现问题

图片方面:前面一些船宽度是300的 后来下载的改成800
csv方面:前面50个是10个一个文件的,后来发现网站的索引最多只支持50

python代码

  1. #coding=utf-8
  2. import urllib
  3. import re
  4. import urllib2
  5. import time
  6. import numpy as np
  7. import os
  8. # get an html
  9. def getHtml(url):
  10. req = urllib2.Request(url, headers = {
  11. 'Connection': 'keep-alive',
  12. 'Accept': '*/*',
  13. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
  14. 'Accept-Language': 'zh-CN,zh;q=0.8'
  15. })
  16. try:
  17. oper = urllib2.urlopen(req,timeout = 300)
  18. html = oper.read()
  19. return html
  20. except:
  21. return None
  22. # get all images in an html
  23. def getImg(html,preDir):
  24. reg = r"data-original='(.+?)0' data-title"
  25. imgre = re.compile(reg)
  26. imglist = re.findall(imgre,html)
  27. x = 0
  28. #print imglist
  29. for imgurl in imglist:
  30. req = urllib2.Request(imgurl, headers = {
  31. 'Connection': 'keep-alive',
  32. 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  33. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36',
  34. 'Accept-Language': 'zh-CN,zh;q=0.8'
  35. })
  36. try:
  37. html = urllib2.urlopen(req,timeout = 300).read()
  38. # save_path = ''%s.jpg' % x'
  39. f_obj = open(preDir+'%s.jpg' % x, 'wb') # wb 表示打开方式
  40. f_obj.write(html)
  41. f_obj.close()
  42. #time.sleep(3000)
  43. x+=1
  44. except:
  45. print 'download img error %s \n ' % x
  46. # get all detail info in an html
  47. # ex: http://www.marinetraffic.com/en/ais/index/ships/all/sort:COUNT_PHOTOS/direction:desc/per_page:50/page:1
  48. def getDetail(html):
  49. reg = r'Show Details For: (.+?)" href="/en/ais/details/ships/shipid:([0-9]+?)/imo:([0-9]+?)/mmsi:([0-9]+?)/vessel:(.+?)">'
  50. imgre = re.compile(reg)
  51. boatDetail = re.findall(imgre,html)
  52. return boatDetail
  53. # save data to csv file
  54. #np.savetxt('testNew.txt',x,fmt=['%s']*x.shape[1],newline='\n')
  55. def save2csv(boatDetail,filename):
  56. boatDetail=np.array(boatDetail)
  57. np.savetxt(filename,boatDetail,fmt=['%s']*boatDetail.shape[1],newline='\n',delimiter=',')
  58. # read data from csv file
  59. # shipId as the index for url and MMSI for filename
  60. def readFcsv(filename):
  61. shipIds,mmsis = np.loadtxt(filename,dtype = 'int',delimiter=',',usecols=(1,3),unpack=True)
  62. return shipIds,mmsis
  63. allboatsUrl = 'http://www.marinetraffic.com/en/ais/index/ships/all/sort:COUNT_PHOTOS/direction:desc/per_page:50/page:'
  64. for pageInx in range(11,50):
  65. html = getHtml(allboatsUrl+str(pageInx))
  66. if html!=None:
  67. csvNum=pageInx+40
  68. save2csv(getDetail(html),'./boatDetail/%s.csv' % csvNum)
  69. print '开始下载地%s页船只的图像' % pageInx
  70. else:
  71. print '下载第%s页的船只信息出错了' % pageInx
  72. continue
  73. shipids,mmsis =readFcsv('./boatDetail/%s.csv' % csvNum)
  74. shipIndex = 0
  75. for ship in shipids:
  76. curDir=str(mmsis[shipIndex])
  77. #os.mkdir(curDir)
  78. #print '跑一遍\n'
  79. preDir='./'+curDir+'/'
  80. imgUrl='http://www.marinetraffic.com/en/photos/of/ships/shipid:'+str(ship)+'/per_page:50'
  81. imgHtml=getHtml(imgUrl)
  82. if imgHtml!=None:
  83. os.mkdir(curDir)
  84. getImg(imgHtml,preDir)
  85. print 'mmsi 为'+curDir+'的船只数据下载完成\n'
  86. else:
  87. print 'mmsi为'+curDir+'的船只数据下载失败\n'
  88. shipIndex+=1

未完但未必会继续

参考资料

Python3爬虫 - Jecvay Notes
python实现简单爬虫功能
如何入门 Python 爬虫? - 知乎
你是如何开始能写python爬虫? - 知乎
Python的正则表达式中的圆括号到底如何使用? 知乎

添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注