@yaohang
2016-07-16T13:52:55.000000Z
字数 1522
阅读 710
未分类
from multiprocessing import Pool
import random
import re
import time

from bs4 import BeautifulSoup
import pymongo
import requests
from selenium import webdriver

# NOTE(review): `header`, `ip`, `go2_all_links_test` and `url_list` are used
# below but are not defined anywhere in the visible source -- presumably they
# are configured elsewhere (request headers, a default proxy mapping, a
# MongoDB collection and the list of listing-page URLs). Confirm before running.

BASE_URL = 'http://www.go2.cn'

# Proxy mappings that passed the check in chenk_ip(), ready for `proxies=`.
valid_ip = []


def get_go2_all_links(url, proxies=None):
    """Fetch one go2.cn listing page and store every product link found on it.

    Args:
        url: Address of the listing page to download.
        proxies: Optional ``requests``-style proxy mapping such as
            ``{'http': 'host:port'}``. When omitted, the module-level ``ip``
            is used, exactly as the original implementation did.

    Every link is inserted into ``go2_all_links_test`` together with the
    local time of the crawl. Links rejected with ``DuplicateKeyError`` are
    reported and skipped. Network errors from ``requests`` propagate to the
    caller.
    """
    if proxies is None:
        # Backward-compatible fallback to the global the original relied on.
        proxies = ip

    response = requests.get(url, headers=header, timeout=2, proxies=proxies)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    # The original treats a page without the listing container as
    # "nothing left to crawl".
    if not soup.select('body > div.l1 > div'):
        print('已经抓完了')
        return

    anchors = soup.select('body > div.l1 > div > ul > li.list01.fy14.bold > a')
    for anchor in anchors:
        href = anchor.get('href')
        if not href:
            # An anchor without href cannot be turned into a link.
            continue
        link = BASE_URL + href
        data = {
            'links': link,
            # Record when the link was crawled (local time).
            'time': time.strftime('%Y-%m-%d %H:%M:%S'),
        }
        try:
            go2_all_links_test.insert_one(data)
        except pymongo.errors.DuplicateKeyError:
            # Already stored: report the file-name part of the link.
            print('此url已经入库{}'.format((link.split('/')[4]).split('.')[0]))
        else:
            print(data)


def chenk_ip(api):
    """Download candidate proxies from *api* and keep the ones that work.

    Args:
        api: URL of a proxy-list service whose response body holds one proxy
            address per line.

    Each candidate is tried against the go2.cn home page with a 2 second
    timeout; those answering with HTTP 200 are appended to the module-level
    ``valid_ip`` list as ``{'http': address}`` mappings.
    """
    resp = requests.get(api)
    # splitlines() copes with both "\r\n" and "\n" terminated responses;
    # blank lines are dropped so no empty proxy is ever tested.
    candidates = [line.strip() for line in resp.text.splitlines() if line.strip()]

    for address in candidates:
        ips = {'http': address}  # the mapping shape `proxies=` expects
        try:
            wb_data = requests.get(BASE_URL + '/', proxies=ips, timeout=2)
        except (requests.exceptions.ReadTimeout,
                requests.exceptions.ConnectTimeout,
                requests.exceptions.ConnectionError,
                requests.exceptions.ChunkedEncodingError):
            print('IP有问题')
            continue
        if wb_data.status_code == requests.codes.ok:  # only accept HTTP 200
            print(ips, '可用')
            valid_ip.append(ips)

    print('本次可用IP:', len(valid_ip))


def _report_failure(exc):
    """Print an exception raised inside a pool worker instead of losing it."""
    print('抓取失败:', repr(exc))


def main():
    """Validate proxies, then crawl every URL in ``url_list`` in parallel."""
    start = time.time()
    chenk_ip('http://www.ip002.com/api?order=1680799938707969&num=10&hide=高匿')
    if not valid_ip:
        # random.choice() would raise a bare IndexError on an empty list.
        raise SystemExit('没有可用的代理IP')
    proxy = random.choice(valid_ip)

    pool = Pool()
    for page_url in url_list:
        # The proxy is passed explicitly: a global assigned here in the
        # parent process is not visible to the worker processes.
        pool.apply_async(get_go2_all_links, (page_url, proxy),
                         error_callback=_report_failure)
    pool.close()
    pool.join()

    end = time.time()
    print(end - start)


if __name__ == '__main__':
    main()