[关闭]
@yaohang 2016-07-16T13:52:55.000000Z 字数 1522 阅读 710

go2.cn 商品链接爬虫(代理IP验证 + 多进程抓取)

未分类


  1. from bs4 import BeautifulSoup
  2. import requests, time,pymongo,re
  3. from selenium import webdriver
  4. from multiprocessing import Pool
  5. import requests
  6. import random
  7. def get_go2_all_links(url): #这是抓取的函数
  8. re = requests.get(url,headers=header,timeout=2,proxies=ip)
  9. re.encoding='utf-8'
  10. soup = BeautifulSoup(re.text,'lxml')
  11. links = soup.select('body > div.l1 > div > ul > li.list01.fy14.bold > a')
  12. if soup.select('body > div.l1 > div'): #判断有没有抓完
  13. for i in links:
  14. f = 'http://www.go2.cn' + i.get('href')
  15. try:
  16. data = {
  17. 'links':'http://www.go2.cn'+i.get('href'),
  18. 'time':time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time())) #保存抓取的时间
  19. }
  20. go2_all_links_test.insert_one(data)
  21. print(data)
  22. except pymongo.errors.DuplicateKeyError:
  23. pass
  24. print('此url已经入库{}'.format((f.split('/')[4]).split('.')[0]))
  25. else:
  26. print('已经抓完了')
  27. pass
  28. valid_ip=[]
  29. def chenk_ip(api): #这是获取代理IP的函数
  30. resp = requests.get(api)
  31. ips_txt = resp.text.strip().split("\r\n") #按换行符分割
  32. for i in ips_txt:
  33. ips={'http':i} #构造proxies能接受的固定格式
  34. try:
  35. wb_data=requests.get('http://www.go2.cn/',proxies=ips,timeout=2)
  36. if wb_data.status_code == requests.codes.ok: #验证返回结果是否为200
  37. print(ips,'可用')
  38. valid_ip.append(ips) #放进list里待用
  39. except (requests.exceptions.ReadTimeout,requests.exceptions.ConnectTimeout,requests.exceptions.ConnectionError,requests.exceptions.ChunkedEncodingError):
  40. print('IP有问题')
  41. pass
  42. print('本次可用IP:',len(valid_ip))
  43. if __name__ == '__main__':
  44. pool=Pool()
  45. start = time.time()
  46. chenk_ip('http://www.ip002.com/api?order=1680799938707969&num=10&hide=高匿')
  47. ipp = random.choice(valid_ip)
  48. for i in url_list:
  49. pool.apply_async(get_go2_all_links,(i,))
  50. pool.close()
  51. pool.join()
  52. end = time.time()
  53. print(end-start)
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注