[关闭]
@hainingwyx 2018-04-26T08:37:05.000000Z 字数 8674 阅读 1124

爬虫基本内容

Python 爬虫


正则表达式

findall

描述:匹配所有符合规律的内容,返回包含结果的列表

表达式 功能
re.findall('x.', code) 返回由 'x' 加其后任意一个字符(换行除外)构成的、长度为 2 的字符串组成的列表
re.findall('x*', code) 返回字符'x'的匹配列表,不匹配为空字符串
re.findall('xx.*x', code) 返回以xx开始和以x为结束的最大字符串组成的列表
re.findall('xx.*?xx', code) 返回以xx开始和以xx为结束的所有字符串组成的列表
re.findall('xx(.*?)xx', code) 返回所有xx开始和xx结束中间的所有内容为字符串组成的列表,换行重新开始
re.findall('xx(.*?)xx', code, re.S) 返回所有xx开始和xx结束中间的所有内容为字符串组成的列表,换行不影响
re.findall('xx(.*?)xx(.*?)xx', code) 返回满足该格式的两个字符串组成的元组列表
re.findall('(\d+)', code) 返回所有纯数字组成的字符串列表

描述:search 匹配并提取第一个符合规律的内容,返回一个匹配对象(Match object);没有匹配时返回 None

表达式 功能
re.search('xx(.*?)xx(.*?)xx', code) 返回匹配对象
re.search('xx(.*?)xx(.*?)xx', code).group(1) 返回符合该格式的第 1 个括号内的字符串,group(2) 则为第 2 个括号内的字符串

sub

描述:sub 替换符合规律的内容,返回替换后的值

表达式 功能
re.sub('xx(.*?)xx', 'bilibili', code) 将匹配到的内容用新字符串代替
n = re.sub('xx(.*?)xx', 'bili%d'%123, code) 引号内的内容用新字符串+数字代替
  # Case: the page holds a single <title> element, so the first match is enough.
title_match = re.search('<title>(.*?)</title>', html, re.S)
title = title_match.group(1)

# Collect the href value of every link written as <a href = '...'>.
links = re.findall("<a href = '(.*?)'>", html, re.S)

实战

  # Download the cover image of every JavaScript course listed on imooc
# (front-end development category, result pages 1 to 3).
import os
import re

import requests

link = "http://www.imooc.com/course/list?c=javascript&page=1"
# Folder that receives the images; created on demand so the script no longer
# depends on a manual setup step. os.path.join keeps the path portable.
out_dir = 'pic from imooc'

container = []
for i in range(1, 4):
    # Rewrite the page number in the query string to walk through the pages.
    new_link = re.sub(r'page=\d+', 'page=%d' % i, link)
    html = requests.get(new_link)
    field = re.findall('<div class="moco-course-wrap">(.*?)</div>', html.text, re.S)
    for field_1 in field:
        pic_links = re.findall('src="(.*?)" height="124"', field_1, re.S)
        # Only the first image of each course card is kept.
        if pic_links:
            container.append(pic_links[0])

if not os.path.isdir(out_dir):
    os.makedirs(out_dir)

# Files are named 1.jpg, 2.jpg, ... in download order.
for num, pic_link in enumerate(container, 1):
    print('Downloading... %s' % pic_link)
    pic = requests.get(pic_link)
    # "with" closes the file even when the write fails.
    with open(os.path.join(out_dir, '%d.jpg' % num), 'wb') as save:
        save.write(pic.content)
  # Scrape lesson titles and subtitles from a Japanese-learning site.
import re

import requests

# Plain request without custom headers; its response is replaced below.
html = requests.get('http://tieba.baidu.com/f?ie=utf-8&kw=%E6%97%A5%E6%9C%AC')

# User-Agent copied from the browser's Network panel.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
html = requests.get('http://jp.tingroom.com/rumen/ryrumen/', headers=headers)
# Force UTF-8 decoding of the response body.
html.encoding = 'utf-8'

field = re.findall('<li style=" font-size:14px;(.*?)</li>', html.text, re.S)
for item in field:
    title_match = re.search('style="color: #039;">(.*?)</a>', item, re.S)
    sub_title_match = re.search('style="color:#666666;">(.*?)</span>', item, re.S)
    # Skip list items that lack either part instead of crashing on None.
    if title_match is None or sub_title_match is None:
        continue
    # Title padded to 40 columns, then a tab-separated subtitle.
    print('%s \t %s' % (format(title_match.group(1), '40'), sub_title_match.group(1)))
  1. # 爬取极客学院课程信息
  2. import requests, re
  # Regex-based crawler for the jikexueyuan.com course catalogue.
class spider(object):
    """Fetch course-list pages and extract per-course details with regexes."""

    def source(self, url):
        """Return the response body of ``url`` as text."""
        html = requests.get(url)
        return html.text

    def pages(self, url, total_page):
        """Return the list of page URLs from the page in ``url`` to ``total_page``.

        ``url`` must contain a ``pageNum=<digits>`` parameter; an
        ``AttributeError`` is raised when it does not. The list is empty when
        the page number in ``url`` is greater than ``total_page``.
        """
        now_page = int(re.search(r'pageNum=(\d+)', url).group(1))
        # No flags are needed here. Passing re.S as the fourth positional
        # argument of re.sub would set ``count``, not ``flags``.
        return [re.sub(r'pageNum=\d+', 'pageNum=%s' % i, url)
                for i in range(now_page, total_page + 1)]

    def get_class(self, source):
        """Return the inner HTML of every course block found in ``source``."""
        every_class = re.findall('style="height: 88px;">(.*?)</div>', source, re.S)
        return every_class

    def getinfo(self, eachclass):
        """Extract title, intro, learner count, duration and level of one course.

        ``eachclass`` is one item returned by ``get_class``. Returns a dict
        with the keys ``title``, ``intro``, ``people``, ``time`` and ``level``.
        Raises ``IndexError`` or ``AttributeError`` when a field is missing.
        """
        info = {}
        # split('>')[1] keeps the text after the first '>' inside the capture
        # (presumably the end of the opening <a ...> tag).
        info['title'] = re.findall('>(.*?)</a></h2>', eachclass, re.S)[0].split('>')[1]
        info['intro'] = re.search('display: none;">(.*?)</p>', eachclass, re.S).group(1)
        info['people'] = re.search('<em class="learn-number">(.*?)</em>', eachclass, re.S).group(1).strip(" ")
        # Plain <em> elements: the first is used as duration, the second as level.
        detail_field = re.findall('<em>(.*?)</em>', eachclass, re.S)
        info['time'] = detail_field[0].strip(" ")
        info['level'] = detail_field[1].strip(" ")
        return info

    def saveinfo(self, classinfo):
        """Append every record of ``classinfo`` to ``jikexueyuan.txt`` as UTF-8.

        ``classinfo`` is an iterable of dicts as returned by ``getinfo``.
        """
        # Local import: io.open accepts ``encoding`` on both Python 2 and 3,
        # so text is encoded once at the file boundary.
        import io

        with io.open('jikexueyuan.txt', 'a', encoding='utf-8') as f:
            for each in classinfo:
                f.write(u'title: ' + each['title'] + u'\n')
                f.write(u'content: ' + each['intro'].strip('\n').strip(' ') + u'\n')
                f.write(u'learn_num: ' + each['people'] + u'\n')
                f.write(u'classtime: ' + each['time'].strip('\n') + u'\n')
                f.write(u'classlevel: ' + each['level'] + u'\n\n')
  # Script entry point: crawl the requested number of catalogue pages and save them.
if __name__ == "__main__":
    # raw_input only exists on Python 2; fall back to input on Python 3.
    try:
        read_line = raw_input
    except NameError:
        read_line = input

    classinfo = []
    url = "http://www.jikexueyuan.com/course/?pageNum=1"
    spider_new = spider()
    pages = int(read_line('How many pages do you want? '))
    all_links = spider_new.pages(url, pages)
    for link in all_links:
        print('正在爬取……' + link)
        html = spider_new.source(link)
        everyclass = spider_new.get_class(html)
        for each in everyclass:
            info = spider_new.getinfo(each)
            classinfo.append(info)
    # Saved once after the crawl: saveinfo appends the whole accumulated list,
    # so calling it inside the loop would write earlier pages repeatedly.
    spider_new.saveinfo(classinfo)

Beautiful Soup

BeautifulSoup优势

  # BeautifulSoup basics, demonstrated on a local test.html.
from bs4 import BeautifulSoup

# "with" closes the file as soon as its content has been read.
with open('test.html', 'r') as f:
    content = f.read()
soup = BeautifulSoup(content, 'html.parser')
print(soup)
print(soup.prettify())  # pretty-printed markup

# select() takes CSS selectors: "." addresses a class, "#" addresses an id.
soup.select('.sister')  # list of elements whose class is "sister"
soup.select('#link1')  # list of elements whose id is "link1"
soup.select('#link1')[0]['href']  # read the link target via ['href']
soup.select('#link1')[0].text  # read the text via .text

soup('p')
soup.find_all('p')  # equivalent to the call above
for text in [item.text for item in soup('p')]:
    print(text)

soup('head')
soup('body')
soup('title')
soup.title.text
soup.title.name  # name of the tag

# find_all(True) matches every tag in the document.
for tag in soup.find_all(True):
    print(tag.name)

# Attribute access returns only the first matching tag.
soup.p
soup.find_all('p')  # returns every matching tag
soup.p['class']
soup.find_all('p', {"class": "story"})  # every <p> whose class is "story"
soup.find_all('p', {'class': 'story'})[0].find_all('a')
soup.a
soup.find_all('a', {'id': 'link3'})
soup.find(id='link3')
soup.find_all(['a', 'b'])  # every <a> and <b> tag
soup.get_text().split('\n')
  # Scrape one WeChat official-account article.
url = ('http://mp.weixin.qq.com/s?__biz=MzIxNTQ4NzAwNA==&mid=2247484008&idx=1'
       '&sn=dfa8a4a371dfbf4c2aa33574b2e99a25&scene=1&'
       'srcid=0824cvxNVK6X3pDWyK08Byhj#rd')
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# First pass: print the raw text of the interesting elements.
heading = soup.find('h2', {'class': 'rich_media_title'})
print(heading.text.strip())
meta = soup.find('div', {'class': 'rich_media_meta_list'})
print(meta.text.strip().replace('\n', ' '))
print(soup.find('em').text)
body = soup.find('div', {'class': 'rich_media_content'})
print(body.text)

# Basic information: title, publication date and article text.
field = soup.find('div', {'class': 'rich_media_meta_list'})
title = soup.find('h2', {'class': 'rich_media_title'}).get_text().strip()
date = field.find(id='post-date').get_text()
content = soup.find('div', {'class': 'rich_media_content'}).get_text()
for value in (title, date, content):
    print(value)

XPath

安装lxml:pip install lxml

HTML

XPath写法

  # XPath basics with lxml, demonstrated on a local demo_1.html.
from lxml import etree

# "with" closes the file as soon as its content has been read.
with open('demo_1.html', 'r') as f:
    html = f.read()
selector = etree.HTML(html)

# Text of each <li> inside the <ul> whose id is "good".
# The leading "//" must be written without a space to be valid XPath.
content = selector.xpath('//ul[@id="good"]/li/text()')
for i in content:
    print(i)

# Text of each <li> that is a direct child of any <ul>.
content_1 = selector.xpath('//ul/li/text()')
for i in content_1:
    print(i)

# href attribute of every <a>.
link = selector.xpath('//a/@href')
for i in link:
    print(i)
  # More XPath: positional predicates, attributes, starts-with() and string(.).
from lxml import etree

# "with" closes each demo file as soon as its content has been read.
with open('demo_2.html') as f:
    html = f.read()
print(html)
selector = etree.HTML(html)

# Text of the first <div> child of <body>.
content_1 = selector.xpath('//body/div[1]/text()')
for content in content_1:
    print(content)

# id attribute of the second <div> child of <body>.
attr_2 = selector.xpath('//body/div[2]/@id')
for attr in attr_2:
    print(attr)

# Text and id of every <div> whose id starts with "test".
content = selector.xpath('//div[starts-with(@id, "test")]/text()')
for item in content:
    print(item)
attrs = selector.xpath('//div[starts-with(@id, "test")]/@id')
for attr in attrs:
    print(attr)

with open('demo_3.html', 'r') as f:
    html = f.read()
print(html)
# Parsed once; the same tree serves all queries below.
selector = etree.HTML(html)
content = selector.xpath('//div[starts-with(@id, "test")]/text()')
for i in content:
    print(i)
content = selector.xpath('//span[starts-with(@id, "test")]/text()')
for i in content:
    print(i)

# string(.) returns the string value of the element, including the text of
# its descendants.
field = selector.xpath('//div[@id="test"]')[0]
content = field.xpath('string(.)')
print(content.replace('\n', ''))

实战

  # Crawl thread listings of the Baidu Tieba "Python" forum into level_1.txt.
# coding:utf-8
import io

import requests
from lxml import etree

# raw_input only exists on Python 2; fall back to input on Python 3.
try:
    read_line = raw_input
except NameError:
    read_line = input

url = 'http://tieba.baidu.com/f?kw=python&ie=utf-8&pn='
url_1 = 'http://tieba.baidu.com'

# XPath fragments, relative to one thread row. The trailing spaces inside the
# class values are part of the markup being matched and must be kept.
RIGHT_COL = 'div[@class="col2_right j_threadlist_li_right "]'
HEADER = RIGHT_COL + '/div[@class="threadlist_lz clearfix"]'
AUTHOR_PATH = HEADER + '/div[@class="threadlist_author pull_right"]/span/@title'
REPLY_PATH = 'div[@class="col2_left j_threadlist_li_left"]/span/text()'
# A thread title appears under one of two class variants; tried in this order.
TITLE_LINK_PATHS = (
    HEADER + '/div[@class="threadlist_title pull_left j_th_tit "]/a/@href',
    HEADER + '/div[@class="threadlist_title pull_left j_th_tit member_thread_title_frs "]/a/@href',
)

# The pn parameter advances in steps of 50, so the user enters a multiple of 50.
page = int(read_line(u'输入50的倍数:'))
lst = [url + str(i) for i in range(0, page, 50)]

# io.open encodes to UTF-8 once at the file boundary on Python 2 and 3.
with io.open('level_1.txt', 'a', encoding='utf-8') as f:
    for item in lst:
        print(u'正在抓取...' + item)
        html_1 = requests.get(item)
        selector_1 = etree.HTML(html_1.text)
        field = selector_1.xpath('//div[@class="t_con cleafix"]')
        for each in field:
            # Reset per thread so a row without a link or time never inherits
            # the values of the previous row.
            link, time = u' ', u' '
            # Text of the right-hand column, split into lines; evaluated once
            # and shared by the title and time lookups.
            summary = each.xpath(RIGHT_COL + '/div')[0].xpath('string(.)').strip().split('\n')
            title = summary[0].strip()
            reply_num = each.xpath(REPLY_PATH)[0]
            author = each.xpath(AUTHOR_PATH)[0].split(':')[1]
            try:
                time = summary[3].strip('\n')
            except IndexError as e:
                # Fewer than four lines: report it and keep the blank placeholder.
                print(e)
            for path in TITLE_LINK_PATHS:
                hrefs = each.xpath(path)
                if hrefs:
                    link = url_1 + hrefs[0]
                    break
            # One tab-separated record per thread.
            f.write(u'\t'.join([title, u'%s' % reply_num, link, author, time]) + u'\n')
            # Progress indicator.
            print(u'%s %s' % (time, title))
添加新批注
在作者公开此批注前,只有你和作者可见。
回复批注