import re
import time
import random

import requests
from bs4 import BeautifulSoup

headers = {
    'Cookie': "_T_WM=72790643300; XSRF-TOKEN=ecaa96; WEIBOCN_FROM=1110006030; SUB=_2A25zx0yBDeRhGeFK71MR-S_OwjSIHXVRSFTJrDV6PUJbkdAKLVXzkW1NQ0tvKI1I2sVjg4nuE5v5eGU-wmim-w6a; SUHB=0eb9kMLYkT2xjK; SCF=Au9bkHrkzgoVu7Rg1Ga1FA7qoMmbNBkYZty58CvrhWLM3ywsXC_WYZAHiyZLo3d88ZnmJiWo35QC-h5cozFseg8.; SSOLoginState=1589853393; MLOGIN=1; M_WEIBOCN_PARAMS=luicode%3D20000174%26uicode%3D20000174",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'close',
}
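# Note: the Cookie above appears to be a leftover Weibo (weibo.cn) session and is
# not needed for people.com.cn; it will also expire, so drop or refresh it if
# requests start failing.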
def get_news_id():
    """Collect the article links from the epidemic-news index page."""
    url = "http://society.people.com.cn/GB/369130/431577/index.html"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }  # a browser User-Agent so the request is not rejected as a bot
    s = requests.session()
    res = s.get(url, headers=headers, timeout=(3, 7))
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.content, 'html.parser')
    # Flatten the news <div> to a string and pull the hrefs out with a regex.
    # (The original also ran soup.find_all('href'), which looks for tags *named*
    # 'href' and never matches; it has been dropped.)
    a = str(soup.find_all('div', class_="dtnews"))
    pattern = '<h2><a href="(.*?)" target="_blank">.*?</a></h2>'
    return re.findall(pattern, a)
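# A minimal alternative sketch (assumption: each target link sits in an
# <h2><a target="_blank"> inside div.dtnews): read the href attributes directly
# with BeautifulSoup instead of regexing over str(soup). Not used below.
def get_news_id_bs():
    url = "http://society.people.com.cn/GB/369130/431577/index.html"
    res = requests.get(url, headers=headers, timeout=(3, 7))
    soup = BeautifulSoup(res.content, 'html.parser')
    return [a['href']
            for div in soup.find_all('div', class_='dtnews')
            for a in div.select('h2 a[target="_blank"]')
            if a.has_attr('href')]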
def get_news(newsid):
    """Scrape each article's body and publication date, appending rows to the CSV."""
    f = open('F:\\人民网text.csv', 'a', encoding='utf-8')
    col = '链接,正文,发布时间'
    f.write(col + '\n')
    ls = []
    count = 1
    for id in newsid:
        print('------- scraping epidemic news item {} -------'.format(count))
        count += 1
        url = "http://society.people.com.cn" + str(id)
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.content, 'html.parser')
        # Extract the article body.
        # ------------------------------------------------------------------
        newlist = soup.find_all('p', style="text-indent: 2em;")
        print(url)
        text = []
        for j in newlist:
            pattern = '<p style="text-indent: 2em;">\n\t(.*?)</p>'
            text += re.findall(pattern, str(j))
        anews = ""  # the assembled article body
        for j in text:
            anews += j
        # ------------------------------------------------------------------
        if len(newlist) == 0:  # fall back for the other page layout
            print("layout mismatch, using fallback extraction")
            container = soup.find_all('div', class_="rm_txt_con cf")
            # Bug fix: search for <p> inside the container div rather than
            # re-querying the whole page (the second find_all used to overwrite
            # the first).
            paragraphs = container[0].find_all('p') if container else soup.find_all('p')
            anews = paragraphs[5].get_text().replace('\n', '')
        # Extract the publication date embedded in the link: /n1/<date>/c...html
        ntime = ''
        for i in re.findall(r'/n1/(.*?)/c.*?\.html', str(id)):
            ntime = str(i)
        f.write(url + ',' + anews + ',' + ntime + '\n')
        ls.append([url, anews, ntime])
        time.sleep(random.randint(3, 7) / 10)  # polite 0.3-0.7 s delay between requests
    f.close()
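# Note: article bodies often contain commas, which silently breaks the
# hand-rolled CSV above. A minimal sketch using the standard csv module instead
# (assumption: the same three-column layout), with quoting handled automatically:
import csv

def write_rows(rows, path='F:\\人民网text.csv'):
    """rows: a list of [url, body, date] triples, as collected in ls above."""
    with open(path, 'a', encoding='utf-8', newline='') as fh:
        writer = csv.writer(fh)
        writer.writerow(['链接', '正文', '发布时间'])
        writer.writerows(rows)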
def get_news_two(newsid):
    """Same as get_news, but for links that already carry a full domain
    (e.g. the legal.people.com.cn links set aside below)."""
    f = open('F:\\人民网text.csv', 'a', encoding='utf-8')
    col = '链接,正文,发布时间'
    f.write(col + '\n')
    ls = []
    count = 1
    for id in newsid:
        print('------- scraping epidemic news item {} -------'.format(count))
        count += 1
        url = str(id)
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.content, 'html.parser')
        # Extract the article body.
        # ------------------------------------------------------------------
        newlist = soup.find_all('p', style="text-indent: 2em;")
        print(url)
        text = []
        for j in newlist:
            pattern = '<p style="text-indent: 2em;">\n\t(.*?)</p>'
            text += re.findall(pattern, str(j))
        anews = ""  # the assembled article body
        for j in text:
            anews += j
        # ------------------------------------------------------------------
        if len(newlist) == 0:  # fall back for the other page layout
            print("layout mismatch, using fallback extraction")
            container = soup.find_all('div', class_="rm_txt_con cf")
            # Bug fix: search for <p> inside the container div rather than
            # re-querying the whole page.
            paragraphs = container[0].find_all('p') if container else soup.find_all('p')
            anews = paragraphs[5].get_text().replace('\n', '')
        # Extract the publication date embedded in the link: /n1/<date>/c...html
        ntime = ''
        for i in re.findall(r'/n1/(.*?)/c.*?\.html', str(id)):
            ntime = str(i)
        f.write(url + ',' + anews + ',' + ntime + '\n')
        ls.append([url, anews, ntime])
        time.sleep(random.randint(3, 7) / 10)  # polite 0.3-0.7 s delay between requests
    f.close()
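# get_news and get_news_two differ only in how the URL is built: index links are
# site-relative ('/n1/...'), while the legal.people.com.cn links are already
# absolute. A small helper (hypothetical, not used above) could fold the two
# functions into one:
def build_url(id, prefix="http://society.people.com.cn"):
    link = str(id)
    return link if link.startswith('http') else prefix + link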
newsid = get_news_id()
print('finished collecting news ids, total:', len(newsid))
newsid_two = []
# Split off the links that point at legal.people.com.cn. Iterate over a copy:
# removing items from a list while iterating over it skips elements.
for i in newsid[:]:
    if re.match("http://legal.people.com.cn.*", i):
        newsid.remove(i)
        newsid_two.append(i)
#print(newsid)
get_news(newsid[68:])
print('saved')
#print(newsid_two)
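# get_news_two is defined but never called; presumably it was meant for the
# absolute links collected in newsid_two. Uncomment to scrape those as well:
# get_news_two(newsid_two)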
The epidemic-news pages being scraped (screenshot)
A sample of the scraped data (screenshot)