# Download news page content for later analysis — a very simple news-scraping example.
import requests
import chardet
import http.cookiejar
import os
from lxml import etree
def geturl(url0):
    """Fetch one ThePaper (m.thepaper.cn) page and return its decoded HTML.

    Retries up to 5 times on non-200 responses. Network errors (e.g. the
    0.5 s timeout firing) propagate to the caller, which is expected to
    catch them.

    Parameters
    ----------
    url0 : str
        Full URL of the article page.

    Returns
    -------
    str
        Decoded page body, or '' if every retry returned a non-200 status.
    """
    # Browser-like headers; Host is pinned to the mobile site.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        "Host": "m.thepaper.cn",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    # Load cookies previously exported in Mozilla/Netscape format.
    filename = 'cookie_pengpainews.txt'  # cookie source file
    cookie = http.cookiejar.MozillaCookieJar()
    cookie.load(filename, ignore_discard=True, ignore_expires=True)
    # Silence the urllib3 warnings triggered by verify=False.
    requests.packages.urllib3.disable_warnings()
    flag = 5  # retry budget
    data0 = ''
    while flag:
        r = requests.get(url0, timeout=0.5, verify=False, headers=headers,
                         cookies=cookie, stream=False)
        r_ok = r.status_code
        r_code = r.encoding
        if r_ok != 200:
            flag = flag - 1  # retry on non-200, up to 5 attempts
        else:
            flag = 0
            print(url0, r_ok, r_code, flag)
            # BUG FIX: r.content.decode() assumed UTF-8. Detect the real
            # encoding with chardet (imported at the top of the file but
            # previously unused) and decode leniently so one bad byte
            # cannot abort the whole download.
            enc = chardet.detect(r.content).get('encoding') or 'utf-8'
            data0 = r.content.decode(enc, errors='replace')
    return data0
def clow(path1, num0):
    """Download one ThePaper article and save its time + paragraphs to a txt file.

    The output file is named '<num0>_<sanitized title>.txt' inside *path1*.
    Articles whose title, body, or timestamp cannot be extracted are skipped
    with an error message.

    Parameters
    ----------
    path1 : str
        Target directory for the output file.
    num0 : int
        Numeric article id appended to the 'baijiahao_' URL.
    """
    url = 'https://m.thepaper.cn/baijiahao_' + str(num0)
    try:
        data0 = geturl(url)
    except Exception:
        # Network failure / missing cookie file: treat as "no data".
        data0 = ''
    if data0:
        parse_html = etree.HTML(data0)
        timu = ''   # news title
        lst = ''    # news paragraphs
        timer = ''  # news timestamp
        try:
            lst = parse_html.xpath("//article//p[@class='contentFont']/text()")
            timu = parse_html.xpath("//div[@class='header']//div[@id='title']/text()")
            timu = ''.join(timu)
            timer = parse_html.xpath("//div[@class='info link']//span[@data-href]/text()")
        except Exception:
            pass
        # BUG FIX: timer[0] raised IndexError whenever the timestamp xpath
        # matched nothing (timer stayed ''), and that line sat outside the
        # try/except above. Guard it so a missing timestamp just yields ''.
        ter = timer[0].strip() if timer else ''
        # Save the article only when all three pieces were extracted.
        if timu and lst and ter:
            # Characters illegal in Windows filenames ('\\/' was previously
            # written as the invalid escape '\/' — same value, now explicit).
            cc = '\\/:*?"<>|'
            ti = [tt for tt in timu if tt not in cc]
            ti0 = str(num0) + '_' + ''.join(ti)
            timu = os.path.join(path1, ti0)
            with open(timu + '.txt', 'w', encoding='utf-8') as fp:
                fp.write(ter + '\n' + ''.join(lst))
            print(url, len(data0), '-------ok------', timu, ter)
        else:
            print(url, '-------error------')
# Create the target output folder if it does not exist.
path0 = os.getcwd()
# BUG FIX: the original built the path by concatenating the raw Windows
# string r'\baidunews\pengpai_1', which is wrong on any non-Windows OS;
# os.path.join is portable and produces the same path on Windows.
path1 = os.path.join(path0, 'baidunews', 'pengpai_1')
# BUG FIX: os.mkdir fails when the intermediate 'baidunews' folder is
# missing; os.makedirs creates the whole chain, and exist_ok=True keeps
# the original "only create if absent" behavior without a TOCTOU race.
os.makedirs(path1, exist_ok=True)
# Walk the article-id range and download each page (best effort).
for i in range(11002819, 12101000):
    try:
        # clow returns None; the original bound it to an unused 'lst'.
        clow(path1, i)
    except Exception:
        # BUG FIX: was a bare 'except:', which also swallowed
        # KeyboardInterrupt/SystemExit and made this million-iteration
        # loop impossible to interrupt. Skip only ordinary failures.
        pass
# (Removed non-code promotional footer text that would have caused a SyntaxError.)