@Jack00878
2017-05-27T15:10:49.000000Z
字数 1608
阅读 411
爬虫
import requests
from bs4 import BeautifulSoup
import time
# Request headers sent with every fetch: a captured session cookie plus a
# desktop-browser User-Agent so Amazon serves the regular page markup.
headers = {
    'Cookie': (
        'x-wl-uid=1SRHzp1DeLjlkeMZldo/fl0/pi+6PwdmcTqHVNEjSzZHR8tWXVA2SJN1KdjxPpA16b7Ka9WSs660=; session-token=ZHPj4ethp'
        'TikSMqY07XPcpqEQONe4LtgKG+h8bm3SeIB+c1gdNchtbU/Va6SF4M0frMpbir72wd1G/FRWeqiuWtSiXae8R/iP2Kkw6/1LAn/N/Wlm8Io4SLbZ8Tmpt/hw3jr'
        '5qOPhEyMvbDzkpRp9qFnF35tZ0DLdzGmqj2eb/QyBbKh92THKfdGDpNjNIu3Uq1UR/SvFHDzByPjS5EO+djExZcNPwNyKr4Oo47qFDs/qzVhrKYapg==; csm-hi'
        't=s-WWCC13SV4AD8122QAJWY|1495877102575; ubid-acbcn=461-6927767-7570326; session-id-time=2082729601l; session-id=462-4221405-5775756'
    ),
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.113 Safari/537.36',
}

# Bestseller list URLs for pages 1-5.
# NOTE(review): the page number is appended after '#', i.e. as a URL
# fragment, which browsers handle client-side and the server never sees —
# so every one of these URLs returns the same first page of results.
_BESTSELLER_URL = (
    'https://www.amazon.cn/gp/bestsellers/books/ref=gwgfloorv1_BMVD_bsl_0?pf_rd_p=3399e1ca-b03f-4b01-8455-0c17dcc9273a&pf_'
    'rd_s=desktop-8&pf_rd_t=36701&pf_rd_i=desktop&pf_rd_m=A1AJ19PSB66TGU&pf_rd_r=C6GYSM9ENDNMS2XN1YNQ&pf_rd_r=C6GYSM9ENDNMS2XN1YNQ&pf_rd_p=3399e1ca-b03f-4b01-8455-0c17dcc9273a#'
)
wb_urls = [_BESTSELLER_URL + str(page) for page in range(1, 6)]
# Scrape book details from one Amazon bestseller page.
def getBookDetail(wb_url):
    """Fetch one Amazon bestseller page and print each book's details.

    Parameters
    ----------
    wb_url : str
        URL of the bestseller list page to scrape.

    Returns
    -------
    list of dict
        One dict per book with 'title', 'author', 'img' and 'price' keys.
        (The original always returned the meaningless constant 0 and threw
        the scraped data away; no caller used that value.)

    Raises
    ------
    requests.HTTPError
        If the server responds with a 4xx/5xx status, instead of silently
        parsing an error or captcha page as before.
    """
    time.sleep(4)  # throttle: be polite and reduce the chance of being blocked
    res = requests.get(wb_url, headers=headers)
    res.raise_for_status()  # fail loudly on HTTP errors
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'lxml')
    # The four selectors walk the page in document order, so the i-th entry
    # of each list belongs to the i-th book; zip pairs them back together.
    titles = soup.select('a.a-link-normal > div')
    imgs = soup.select('img.a-thumbnail-left')
    authors = soup.select('span.a-size-small.a-color-base')
    prices = soup.select('span.p13n-sc-price')
    books = []
    for title, author, img, price in zip(titles, authors, imgs, prices):
        data = {
            'title': title.get_text().strip(),
            'author': author.get_text(),
            'img': img.get('src'),
            'price': price.get_text(),
        }
        print(data)
        books.append(data)
    return books
# Driver: walk every bestseller page URL. The actual crawl is left disabled;
# as noted where wb_urls is built, the '#<n>' suffix is a client-side
# fragment, so every request would just re-fetch the first page.
for single_url in wb_urls:
    print(single_url)
    # getBookDetail(single_url)  # disabled: can only ever fetch page 1