[关闭]
@Jack00878 2017-05-27T15:10:49.000000Z 字数 1608 阅读 411

爬取 Amazon 图书榜单

爬虫


  1. import requests
  2. from bs4 import BeautifulSoup
  3. import time
  # HTTP headers sent with every page request: a browser session cookie and a
  # desktop Chrome User-Agent string.
  # NOTE(review): the cookie embeds a hard-coded session token; consider
  # loading it from configuration or the environment instead of source.
  headers = {
      'Cookie': (
          'x-wl-uid=1SRHzp1DeLjlkeMZldo/fl0/pi+6PwdmcTqHVNEjSzZHR8tWXVA2SJN1KdjxPpA16b7Ka9WSs660=; '
          'session-token=ZHPj4ethp'
          'TikSMqY07XPcpqEQONe4LtgKG+h8bm3SeIB+c1gdNchtbU/Va6SF4M0frMpbir72wd1G/FRWeqiuWtSiXae8R/iP2Kkw6/1LAn/N/Wlm8Io4SLbZ8Tmpt/hw3jr'
          '5qOPhEyMvbDzkpRp9qFnF35tZ0DLdzGmqj2eb/QyBbKh92THKfdGDpNjNIu3Uq1UR/SvFHDzByPjS5EO+djExZcNPwNyKr4Oo47qFDs/qzVhrKYapg==; '
          'csm-hit=s-WWCC13SV4AD8122QAJWY|1495877102575; '
          'ubid-acbcn=461-6927767-7570326; '
          'session-id-time=2082729601l; '
          'session-id=462-4221405-5775756'
      ),
      'User-Agent': (
          'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
          '(KHTML, like Gecko) Chrome/53.0.2785.113 Safari/537.36'
      ),
  }
  # URLs for pages 1-5 of the amazon.cn books bestseller list.
  # The page number is placed in the ``pg`` query parameter as well as in the
  # ``#N`` fragment. A URL fragment is never transmitted to the server, so with
  # the fragment alone every request fetched the same (first) page.
  # NOTE(review): ``pg`` is assumed to be the pagination parameter of the
  # bestseller pages - confirm against the live site.
  wb_urls = [
      (
          'https://www.amazon.cn/gp/bestsellers/books/ref=gwgfloorv1_BMVD_bsl_0'
          '?pf_rd_p=3399e1ca-b03f-4b01-8455-0c17dcc9273a'
          '&pf_rd_s=desktop-8&pf_rd_t=36701&pf_rd_i=desktop&pf_rd_m=A1AJ19PSB66TGU'
          '&pf_rd_r=C6GYSM9ENDNMS2XN1YNQ&pf_rd_r=C6GYSM9ENDNMS2XN1YNQ'
          '&pf_rd_p=3399e1ca-b03f-4b01-8455-0c17dcc9273a'
          '&pg={page}#{page}'
      ).format(page=i)
      for i in range(1, 6)
  ]
  def getBookDetail(wb_url, *, timeout=10):
      """Fetch one Amazon bestseller page and print a record for each book.

      The page is requested with the module-level ``headers``, parsed with
      BeautifulSoup, and the title, author, cover-image URL and price of each
      listed book are printed as a dict.

      Args:
          wb_url: URL of the bestseller page to scrape.
          timeout: Seconds to wait for the server before giving up, so a
              stalled connection cannot block the script indefinitely.

      Returns:
          Always ``0``.

      Raises:
          requests.exceptions.RequestException: if the request fails, times
              out, or the server answers with an HTTP error status.
      """
      # Pause 4 s before each request (presumably to avoid hammering the site).
      time.sleep(4)
      res = requests.get(wb_url, headers=headers, timeout=timeout)
      # Fail loudly on 4xx/5xx instead of silently parsing an error page.
      res.raise_for_status()
      res.encoding = 'utf-8'
      soup = BeautifulSoup(res.text, 'lxml')

      titles = soup.select('a.a-link-normal > div')
      imgs = soup.select('img.a-thumbnail-left')
      authors = soup.select('span.a-size-small.a-color-base')
      prices = soup.select('span.p13n-sc-price')

      # zip() stops at the shortest list, so records are only emitted while
      # all four selectors still have a match.
      for title, author, img, price in zip(titles, authors, imgs, prices):
          data = {
              'title': title.get_text().strip(),
              'author': author.get_text(),
              'img': img.get('src'),
              'price': price.get_text(),
          }
          print(data)
      return 0
  # Script entry: list every page URL that would be scraped.
  for single_url in wb_urls:
      print(single_url)
      # getBookDetail(single_url)  # left disabled: it can only crawl the first page
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注