# Download news page content for later analysis — a very simple news-scraping example.
import requests
import chardet
import http.cookiejar
import os
from lxml import etree
def geturl(url0):
    """Fetch one ThePaper (m.thepaper.cn) page and return its decoded HTML.

    Retries up to 5 times on non-200 responses. Network errors (e.g. the
    0.5 s timeout firing) propagate to the caller, which is expected to
    catch them.

    Parameters
    ----------
    url0 : str
        Full URL of the article page.

    Returns
    -------
    str
        Decoded page body, or '' if every retry returned a non-200 status.
    """
    # Browser-like headers; Host is pinned to the mobile site.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36",
        "Host": "m.thepaper.cn",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
    # Load cookies previously exported in Mozilla/Netscape format.
    filename = 'cookie_pengpainews.txt'  # cookie source file
    cookie = http.cookiejar.MozillaCookieJar()
    cookie.load(filename, ignore_discard=True, ignore_expires=True)
    # Silence the urllib3 warnings triggered by verify=False.
    requests.packages.urllib3.disable_warnings()
    flag = 5  # retry budget
    data0 = ''
    while flag:
        r = requests.get(url0, timeout=0.5, verify=False, headers=headers,
                         cookies=cookie, stream=False)
        r_ok = r.status_code
        r_code = r.encoding
        if r_ok != 200:
            flag = flag - 1  # retry on non-200, up to 5 attempts
        else:
            flag = 0
            print(url0, r_ok, r_code, flag)
            # BUG FIX: r.content.decode() assumed UTF-8. Detect the real
            # encoding with chardet (imported at the top of the file but
            # previously unused) and decode leniently so one bad byte
            # cannot abort the whole download.
            enc = chardet.detect(r.content).get('encoding') or 'utf-8'
            data0 = r.content.decode(enc, errors='replace')
    return data0
def clow(path1, num0):
    """Download one ThePaper article and save its time + paragraphs to a txt file.

    The output file is named '<num0>_<sanitized title>.txt' inside *path1*.
    Articles whose title, body, or timestamp cannot be extracted are skipped
    with an error message.

    Parameters
    ----------
    path1 : str
        Target directory for the output file.
    num0 : int
        Numeric article id appended to the 'baijiahao_' URL.
    """
    url = 'https://m.thepaper.cn/baijiahao_' + str(num0)
    try:
        data0 = geturl(url)
    except Exception:
        # Network failure / missing cookie file: treat as "no data".
        data0 = ''
    if data0:
        parse_html = etree.HTML(data0)
        timu = ''   # news title
        lst = ''    # news paragraphs
        timer = ''  # news timestamp
        try:
            lst = parse_html.xpath("//article//p[@class='contentFont']/text()")
            timu = parse_html.xpath("//div[@class='header']//div[@id='title']/text()")
            timu = ''.join(timu)
            timer = parse_html.xpath("//div[@class='info link']//span[@data-href]/text()")
        except Exception:
            pass
        # BUG FIX: timer[0] raised IndexError whenever the timestamp xpath
        # matched nothing (timer stayed ''), and that line sat outside the
        # try/except above. Guard it so a missing timestamp just yields ''.
        ter = timer[0].strip() if timer else ''
        # Save the article only when all three pieces were extracted.
        if timu and lst and ter:
            # Characters illegal in Windows filenames ('\\/' was previously
            # written as the invalid escape '\/' — same value, now explicit).
            cc = '\\/:*?"<>|'
            ti = [tt for tt in timu if tt not in cc]
            ti0 = str(num0) + '_' + ''.join(ti)
            timu = os.path.join(path1, ti0)
            with open(timu + '.txt', 'w', encoding='utf-8') as fp:
                fp.write(ter + '\n' + ''.join(lst))
            print(url, len(data0), '-------ok------', timu, ter)
        else:
            print(url, '-------error------')
# Create the target output folder if it does not exist.
path0 = os.getcwd()
# BUG FIX: the original built the path by concatenating the raw Windows
# string r'\baidunews\pengpai_1', which is wrong on any non-Windows OS;
# os.path.join is portable and produces the same path on Windows.
path1 = os.path.join(path0, 'baidunews', 'pengpai_1')
# BUG FIX: os.mkdir fails when the intermediate 'baidunews' folder is
# missing; os.makedirs creates the whole chain, and exist_ok=True keeps
# the original "only create if absent" behavior without a TOCTOU race.
os.makedirs(path1, exist_ok=True)
# Walk the article-id range and download each page (best effort).
for i in range(11002819, 12101000):
    try:
        # clow returns None; the original bound it to an unused 'lst'.
        clow(path1, i)
    except Exception:
        # BUG FIX: was a bare 'except:', which also swallowed
        # KeyboardInterrupt/SystemExit and made this million-iteration
        # loop impossible to interrupt. Skip only ordinary failures.
        pass
# (Removed non-code promotional footer text that would have caused a SyntaxError.)