Dang!Dang!Dang!第一只爬虫顺利诞生,看得出有很多不足和繁琐的地方!
实现功能:
对给定人民网新闻页面进行标题、时间和来源的抓取,并找出该网页的推荐新闻。
效果如图:
该网址为:http://politics.people.com.cn/n1/2018/0101/c1001-29738654.html
代码如下:
import requests
from bs4 import BeautifulSoup
def url(newurl):
    """Fetch a People's Daily article page and scrape its metadata.

    Parameters:
        newurl: URL of the article page.

    Returns:
        dict with up to three keys — 'title' (the <title> text),
        'source' (the <meta name="source"> content), and 'date'
        (the <meta name="publishdate"> content). Keys are absent
        when the corresponding tag is missing.
    """
    news = {}
    res = requests.get(newurl)
    # The site serves legacy Chinese pages; decode as GB2312 so the
    # scraped text is not mojibake.
    res.encoding = 'GB2312'
    soup = BeautifulSoup(res.text, 'lxml')
    # Read attributes through BeautifulSoup instead of string-hacking
    # str(tag): the old lstrip('name="')/rstrip('"/>') calls stripped
    # *character sets*, corrupting values that start or end with any of
    # those characters, and split(' ') broke on attribute values
    # containing spaces.
    title_tag = soup.select_one('head title')
    if title_tag is not None:
        news['title'] = title_tag.text
    for meta in soup.select('head meta'):
        name = meta.get('name')
        content = meta.get('content', '')
        if name == 'source':
            news['source'] = content
        elif name == 'publishdate':
            news['date'] = content
    return news
def pop(a, b, c):
    """Print the three pieces of article metadata, one per line."""
    for item in (a, b, c):
        print(item)
def url_hot(newurl):
    """Print the headline of every recommended-news item on the page.

    The recommendations live in the element with id ``rwb_rdtj``; each
    ``<li>`` inside it is one headline.
    """
    response = requests.get(newurl)
    # Legacy Chinese encoding used by the site.
    response.encoding = 'GB2312'
    page = BeautifulSoup(response.text, 'lxml')
    for container in page.select('#rwb_rdtj'):
        for item in container.select('li'):
            print(item.text)
# --- Script entry point ---------------------------------------------------
# Scrape one sample article: print its title/source/date, then the page's
# recommended-news headlines. (Removed: a dead `ne = {}` assignment that was
# immediately overwritten, a commented-out json.dumps line, and a stray
# trailing semicolon.)
a = 'http://politics.people.com.cn/n1/2018/0101/c1001-29738654.html'
ne = url(a)
pop(ne['title'], ne['source'], ne['date'])
print('The page is recommended as follows:')  # Chinese: '页面推荐如下:'
url_hot(a)