Python 编写新浪新闻网络爬虫
源码:
import requests
import re
import json
import pandas
from bs4 import BeautifulSoup
from datetime import datetime
# JSONP endpoint that returns a story's comment metadata as
# "var data={...}".  The `{}` slot is filled with the story's newsid so
# the per-article comment count can be fetched dynamically.
commentURL = 'http://comment5.news.sina.com.cn/page/info?\
version=1&format=js&channel=gn&newsid=comos-{}&\
group=&compress=0&ie=utf-8&oe=utf-8&page=1&page_size=20'
# Fetch the total comment count for a single news story.
def getCommentCounts(newsurl):
    """Return the total number of comments for the story at *newsurl*.

    The newsid is extracted from the article URL (``...doc-i<ID>.shtml``)
    and substituted into ``commentURL``; the endpoint answers with a
    JSONP-style body ``var data={...}``.
    """
    # Escape the literal dot so e.g. "doc-iXshtml" cannot match.
    newsid = re.search(r'doc-i(.*)\.shtml', newsurl).group(1)
    comments = requests.get(commentURL.format(newsid))
    text = comments.text
    # BUGFIX: str.strip('var data=') removes any of the characters
    # v/a/r/space/d/t/= from BOTH ends, which can corrupt the JSON
    # payload.  Remove the fixed prefix explicitly instead.
    prefix = 'var data='
    if text.startswith(prefix):
        text = text[len(prefix):]
    jd = json.loads(text)
    return jd['result']['count']['total']
# Collect every field of interest for one news story.
def getNewsDetail(newsurl):
    """Return a dict describing the story at *newsurl*.

    Keys: ``title``, ``time`` (datetime), ``comments`` (int),
    ``article`` (body text, paragraphs joined by spaces),
    ``editor`` and ``newssource``.
    """
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'  # page declares UTF-8; requests may guess wrong
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#artibodyTitle')[0].text
    timesource = soup.select('.time-source')[0].contents[0].strip()
    result['time'] = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    result['comments'] = getCommentCounts(newsurl)
    # Hoist the paragraph query: last <p> holds the editor credit, the
    # rest are the article body.
    paragraphs = soup.select('#artibody p')
    result['article'] = ' '.join(p.text.strip() for p in paragraphs[:-1])
    editor = paragraphs[-1].text
    # BUGFIX: lstrip('责任编辑:') strips a character SET, so an editor
    # name beginning with any of those characters would be truncated.
    # Remove the fixed prefix explicitly instead.
    prefix = '责任编辑:'
    if editor.startswith(prefix):
        editor = editor[len(prefix):]
    result['editor'] = editor
    result['newssource'] = soup.select('.time-source span a')[0].text
    return result
# Walk every story link on one list page and gather its details.
def parseListLinks(url):
    """Return a list of news-detail dicts for every story listed at *url*.

    *url* points at the rolling-news JSONP API; the body looks like
    ``newsloadercallback({...});``.
    """
    newsdetails = []
    res = requests.get(url)
    # BUGFIX: the original referenced an undefined name `res2`
    # (NameError at runtime).  Also, lstrip/rstrip strip character
    # sets, not prefixes/suffixes — unwrap the JSONP payload by
    # locating the outermost parentheses instead.
    text = res.text
    start = text.find('(')
    end = text.rfind(')')
    jd = json.loads(text[start + 1:end])
    for entry in jd['result']['data']:
        newsdetails.append(getNewsDetail(entry['url']))
    return newsdetails
# Rolling news-list API; the `{}` slot is filled with a page number.
# The JSONP response (wrapped in `newsloadercallback(...)`) carries the
# URL of every story on that page.
url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&\
cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&\
show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}&\
callback=newsloadercallback&_=1505360465046'
# Gather news details for a range of list pages.
def getPageNews(first, last):
    """Scrape list pages ``first`` .. ``last - 1`` (half-open range)
    and return one flat list of news-detail dicts."""
    collected = []
    for page in range(first, last):
        collected.extend(parseListLinks(url.format(page)))
    return collected
# Scrape pages 1-2 and display the result as a table.
news_total = getPageNews(1, 3)
df = pandas.DataFrame(news_total)
# BUGFIX: a bare `df` expression only renders inside a notebook; in a
# plain script it is a no-op, so print it explicitly.
print(df)
相关知识链接:
requests: https://requests.readthedocs.io/
BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
re: https://docs.python.org/3/library/re.html
Pandas: https://pandas.pydata.org/docs/