import re
import time
import random

import requests
from bs4 import BeautifulSoup

headers = {
    'Cookie': "_T_WM=72790643300; XSRF-TOKEN=ecaa96; WEIBOCN_FROM=1110006030; SUB=_2A25zx0yBDeRhGeFK71MR-S_OwjSIHXVRSFTJrDV6PUJbkdAKLVXzkW1NQ0tvKI1I2sVjg4nuE5v5eGU-wmim-w6a; SUHB=0eb9kMLYkT2xjK; SCF=Au9bkHrkzgoVu7Rg1Ga1FA7qoMmbNBkYZty58CvrhWLM3ywsXC_WYZAHiyZLo3d88ZnmJiWo35QC-h5cozFseg8.; SSOLoginState=1589853393; MLOGIN=1; M_WEIBOCN_PARAMS=luicode%3D20000174%26uicode%3D20000174",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.122 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'close',
}
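# Note: the Cookie above appears to be a leftover Weibo (weibo.cn) session and is
# not needed for people.com.cn; it will also expire, so drop or refresh it if
# requests start failing.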
def get_news_id():
    """Collect the article links from the epidemic-news index page."""
    url = "http://society.people.com.cn/GB/369130/431577/index.html"
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }  # a browser User-Agent so the request is not rejected as a bot
    s = requests.session()
    res = s.get(url, headers=headers, timeout=(3, 7))
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.content, 'html.parser')
    # Flatten the news <div> to a string and pull the hrefs out with a regex.
    # (The original also ran soup.find_all('href'), which looks for tags *named*
    # 'href' and never matches; it has been dropped.)
    a = str(soup.find_all('div', class_="dtnews"))
    pattern = '<h2><a href="(.*?)" target="_blank">.*?</a></h2>'
    return re.findall(pattern, a)
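# A minimal alternative sketch (assumption: each target link sits in an
# <h2><a target="_blank"> inside div.dtnews): read the href attributes directly
# with BeautifulSoup instead of regexing over str(soup). Not used below.
def get_news_id_bs():
    url = "http://society.people.com.cn/GB/369130/431577/index.html"
    res = requests.get(url, headers=headers, timeout=(3, 7))
    soup = BeautifulSoup(res.content, 'html.parser')
    return [a['href']
            for div in soup.find_all('div', class_='dtnews')
            for a in div.select('h2 a[target="_blank"]')
            if a.has_attr('href')]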
def get_news(newsid):
    """Scrape each article's body and publication date, appending rows to the CSV."""
    f = open('F:\\人民网text.csv', 'a', encoding='utf-8')
    col = '链接,正文,发布时间'
    f.write(col + '\n')
    ls = []
    count = 1
    for id in newsid:
        print('------- scraping epidemic news item {} -------'.format(count))
        count += 1
        url = "http://society.people.com.cn" + str(id)
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.content, 'html.parser')
        # Extract the article body.
        # ------------------------------------------------------------------
        newlist = soup.find_all('p', style="text-indent: 2em;")
        print(url)
        text = []
        for j in newlist:
            pattern = '<p style="text-indent: 2em;">\n\t(.*?)</p>'
            text += re.findall(pattern, str(j))
        anews = ""  # the assembled article body
        for j in text:
            anews += j
        # ------------------------------------------------------------------
        if len(newlist) == 0:  # fall back for the other page layout
            print("layout mismatch, using fallback extraction")
            container = soup.find_all('div', class_="rm_txt_con cf")
            # Bug fix: search for <p> inside the container div rather than
            # re-querying the whole page (the second find_all used to overwrite
            # the first).
            paragraphs = container[0].find_all('p') if container else soup.find_all('p')
            anews = paragraphs[5].get_text().replace('\n', '')
        # Extract the publication date embedded in the link: /n1/<date>/c...html
        ntime = ''
        for i in re.findall(r'/n1/(.*?)/c.*?\.html', str(id)):
            ntime = str(i)
        f.write(url + ',' + anews + ',' + ntime + '\n')
        ls.append([url, anews, ntime])
        time.sleep(random.randint(3, 7) / 10)  # polite 0.3-0.7 s delay between requests
    f.close()
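# Note: article bodies often contain commas, which silently breaks the
# hand-rolled CSV above. A minimal sketch using the standard csv module instead
# (assumption: the same three-column layout), with quoting handled automatically:
import csv

def write_rows(rows, path='F:\\人民网text.csv'):
    """rows: a list of [url, body, date] triples, as collected in ls above."""
    with open(path, 'a', encoding='utf-8', newline='') as fh:
        writer = csv.writer(fh)
        writer.writerow(['链接', '正文', '发布时间'])
        writer.writerows(rows)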
def get_news_two(newsid):
    """Same as get_news, but for links that already carry a full domain
    (e.g. the legal.people.com.cn links set aside below)."""
    f = open('F:\\人民网text.csv', 'a', encoding='utf-8')
    col = '链接,正文,发布时间'
    f.write(col + '\n')
    ls = []
    count = 1
    for id in newsid:
        print('------- scraping epidemic news item {} -------'.format(count))
        count += 1
        url = str(id)
        res = requests.get(url, headers=headers)
        res.encoding = 'utf-8'
        soup = BeautifulSoup(res.content, 'html.parser')
        # Extract the article body.
        # ------------------------------------------------------------------
        newlist = soup.find_all('p', style="text-indent: 2em;")
        print(url)
        text = []
        for j in newlist:
            pattern = '<p style="text-indent: 2em;">\n\t(.*?)</p>'
            text += re.findall(pattern, str(j))
        anews = ""  # the assembled article body
        for j in text:
            anews += j
        # ------------------------------------------------------------------
        if len(newlist) == 0:  # fall back for the other page layout
            print("layout mismatch, using fallback extraction")
            container = soup.find_all('div', class_="rm_txt_con cf")
            # Bug fix: search for <p> inside the container div rather than
            # re-querying the whole page.
            paragraphs = container[0].find_all('p') if container else soup.find_all('p')
            anews = paragraphs[5].get_text().replace('\n', '')
        # Extract the publication date embedded in the link: /n1/<date>/c...html
        ntime = ''
        for i in re.findall(r'/n1/(.*?)/c.*?\.html', str(id)):
            ntime = str(i)
        f.write(url + ',' + anews + ',' + ntime + '\n')
        ls.append([url, anews, ntime])
        time.sleep(random.randint(3, 7) / 10)  # polite 0.3-0.7 s delay between requests
    f.close()
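# get_news and get_news_two differ only in how the URL is built: index links are
# site-relative ('/n1/...'), while the legal.people.com.cn links are already
# absolute. A small helper (hypothetical, not used above) could fold the two
# functions into one:
def build_url(id, prefix="http://society.people.com.cn"):
    link = str(id)
    return link if link.startswith('http') else prefix + link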
newsid = get_news_id()
print('finished collecting news ids, total:', len(newsid))
newsid_two = []
# Split off the links that point at legal.people.com.cn. Iterate over a copy:
# removing items from a list while iterating over it skips elements.
for i in newsid[:]:
    if re.match("http://legal.people.com.cn.*", i):
        newsid.remove(i)
        newsid_two.append(i)
#print(newsid)
get_news(newsid[68:])
print('saved')
#print(newsid_two)
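# get_news_two is defined but never called; presumably it was meant for the
# absolute links collected in newsid_two. Uncomment to scrape those as well:
# get_news_two(newsid_two)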
The epidemic-news pages being scraped (screenshot)
A sample of the scraped data (screenshot)