Python 爬虫现阶段用到的包是requests还有BeautifulSoup4。
requests主要用来模拟HTTP的GET方式请求并读取网页的信息
BeautifulSoup4则是提取网页中tag标签里面的某些特定信息
范例:
import requests  # import the requests package

# Fetch the page with an HTTP GET request.
# (The original crammed the import, the get() call and a comment onto one
# line, which is a syntax error.)
res = requests.get('http://news.sina.com.cn/china/')
# Decode the response body as UTF-8.  NOTE: the original used 'utf-8 ' with
# a trailing space, which is not a valid codec name.
res.encoding = 'utf-8'
print(res.text)  # print the decoded page source
from bs4 import BeautifulSoup  # import the BeautifulSoup4 package

# Sample HTML source code.  Adjacent string literals inside parentheses are
# concatenated into one string with no newlines — exactly the same string
# the original built with backslash line-continuations.
html_sample = (
    '<html>'
    '<body>'
    '<h1 id="title">Hello World</h1>'
    '<a href="#" class="link">This is link1</a>'
    '<a href="# link2" class="link">This is link2</a>'
    '</body>'
    '</html>'
)
soup = BeautifulSoup(html_sample, 'html.parser')
print(soup.text)  # print only the text content, tags stripped
爬取sina新闻网页内容范例:
import requests
from bs4 import BeautifulSoup

# Scrape the Sina China news list page and print time/title/link for each item.
# (The original loop body was unindented, which is a syntax error in Python.)
res = requests.get('http://news.sina.com.cn/china/')
res.encoding = 'utf-8'
soup = BeautifulSoup(res.text, 'html.parser')
for news in soup.select('.news-item'):
    # Some .news-item nodes are placeholders with no <h2> title — skip them.
    if news.select('h2'):
        h2 = news.select('h2')[0].text
        a = news.select('a')[0]['href']
        time = news.select('.time')[0].text
        print(time, h2, a)
strip()函数可以去除字符串首尾不想要的字符或空格(注意:参数是按"字符集合"匹配,不是按整个前缀/后缀匹配)
join()函数可以合并一些东西
from datetime import datetime

# Example: parse a Chinese-formatted timestamp string into a datetime.
# strptime converts a str to a datetime using a caller-defined format.
# (The original referenced an undefined `timesource`; a concrete example
# value is supplied here so the snippet actually runs.)
timesource = '2017年02月14日10:05'
dt = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
从datetime模块中import datetime组件
可以把str字符串转化成时间,并可自行定义解析格式
如果想要的东西是以json格式嵌在文章里面,需要用到json模块
抓取新浪网新闻评论数范例:
先找出评论的网页
import requests
import json

# Fetch the comment-count data for one article.  The endpoint returns
# JavaScript ("var loader_... = {...};"), not bare JSON, so slice out the
# object between the first '{' and the last '}' before parsing.
# NOTE: the original used str.strip('var=loader_...' 'var data='), but
# strip() treats its argument as a character SET, not a prefix/suffix —
# that only worked by accident and can corrupt the payload.
comments = requests.get(
    'http://comment5.news.sina.com.cn/page/info?version=1&format=js&channel='
    'gn&newsid=comos-fyamkqa6031838&group=&compress=0&ie=utf-8&oe=utf-8&'
    'page=1&page_size=20&jsvar=loader_1487053268732_20451525')
text = comments.text
jd = json.loads(text[text.find('{'): text.rfind('}') + 1])
print(jd['result']['count']['total'])  # total number of comments
找出一则新闻的信息,包括标题,时间,正文,编辑等等,先定义一个function
def getNewsDetail(newsurl):
    """Scrape one Sina news article page.

    Returns a list of labelled strings: title, time, source, body text,
    editor, and comment count.  Relies on module-level imports of
    requests, BeautifulSoup, datetime, re and json.
    (The original function body was unindented — a syntax error.)
    """
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    # The news id is embedded in the URL: .../doc-i<newsid>.shtml
    # (the '.' before shtml is now escaped; unescaped it matched any char).
    newsid = re.search(r'doc-i(.+)\.shtml', newsurl).group(1)
    title = soup.select('#artibodyTitle')[0].text
    timesource = datetime.strptime(
        soup.select('.time-source')[0].contents[0].strip(),
        '%Y年%m月%d日%H:%M')
    time = datetime.strftime(timesource, '%Y-%m-%d:%H:%M')
    medianame = soup.select('.time-source span a')[0].text
    # The last <p> is the editor line, so join all but the last as the body.
    body = ' '.join(p.text.strip() for p in soup.select('#artibody p')[:-1])
    editor = re.search('责任编辑:(.+) ',
                       soup.select('.article-editor')[0].text).group(1)
    commentURL = ('http://comment5.news.sina.com.cn/page/info?version=1'
                  '&format=js&channel=gn&newsid=comos-{}&group=&compress=0'
                  '&ie=utf-8&oe=utf-8&page=1&page_size=20')
    comment_text = requests.get(commentURL.format(newsid)).text
    # The endpoint returns "var data={...}"; slice out the JSON object
    # itself rather than str.strip('var data='), which strips a character
    # SET and can eat characters belonging to the payload.
    comment_json = json.loads(
        comment_text[comment_text.find('{'): comment_text.rfind('}') + 1])
    comments = str(comment_json['result']['count']['total'])
    result = ['标题:' + title, '时间:' + time, '来源:' + medianame,
              '正文:' + body, '编辑:' + editor, '评论数:' + comments]
    return result

getNewsDetail('http://news.sina.com.cn/o/2017-02-14/doc-ifyamvns5229031.shtml')
得到多页新闻标记+链接:
def getNewsInfo(till_page):
    """Collect [title, url] pairs from the Sina rolling-news API for
    pages 1..till_page inclusive.

    Relies on module-level imports of requests and json.
    (The original function body was unindented — a syntax error.)
    """
    newsURL = ('http://api.roll.news.sina.com.cn/zt_list?channel=news'
               '&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2'
               '&show_ext=1&show_all=1&show_num=22&tag=1&format=json'
               '&page={}&callback=newsloadercallback&_=1487069108723')
    info = []
    for page_number in range(1, till_page + 1):
        res = requests.get(newsURL.format(page_number))
        # The response is JSONP: "  newsloadercallback({...});".  Slice the
        # JSON object out with find/rfind — str.strip treats its argument
        # as a character set and could corrupt the payload edges.
        text = res.text
        jd = json.loads(text[text.find('{'): text.rfind('}') + 1])
        for news in jd['result']['data']:
            info.append([news['title'], news['url']])
    return info

getNewsInfo(2)
得到多页新闻信息:
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import json
import re
import pandas

# Walk pages 1-2 of the rolling-news API, then scrape title / time /
# source / editor from every linked article.
# (The original loop bodies were unindented — a syntax error.)
divisionURL = ('http://api.roll.news.sina.com.cn/zt_list?channel=news'
               '&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2'
               '&show_ext=1&show_all=1&show_num=22&tag=1&format=json'
               '&page={}&callback=newsloadercallback&_=1487122877965')
result = []
for page_number in range(1, 3):
    res = requests.get(divisionURL.format(page_number))
    # JSONP response: cut the "newsloadercallback(...);" wrapper off by
    # slicing out the JSON object — str.strip would treat its argument as
    # a character set, not a prefix/suffix, and may corrupt the payload.
    text = res.text
    jd = json.loads(text[text.find('{'): text.rfind('}') + 1])
    for item in jd['result']['data']:
        url = item['url']
        newsres = requests.get(url)
        newsres.encoding = 'utf-8'
        soup = BeautifulSoup(newsres.text, 'html.parser')
        title = soup.select('#artibodyTitle')[0].text
        timesource = datetime.strptime(
            soup.select('.time-source')[0].contents[0].strip(),
            '%Y年%m月%d日%H:%M')
        time = datetime.strftime(timesource, '%Y-%m-%d:%H:%M')
        medianame = soup.select('.time-source span a')[0].text
        editor = re.search('责任编辑:(.+) ',
                           soup.select('.article-editor')[0].text).group(1)
        result.append(('标题:' + title, '链接:' + url, '时间:' + time,
                       '来源:' + medianame, '编辑:' + editor))
print(result)