Python 爬取网页标题和链接地址

最新推荐文章于 2024-04-30 16:07:18 发布

molly_spring

最新推荐文章于 2024-04-30 16:07:18 发布

阅读量569

点赞数

本文链接：https://blog.csdn.net/molly_spring/article/details/126129285

版权

python

import requests,re,json,xlwt
# Scrape post titles and abstracts from the Eastmoney Guba board listing and
# store them in an .xls workbook (column 0: title, column 1: abstract).

OUTPUT_PATH = "E:/pythonDemo/ccc.xls"
BASE_URL = "https://guba.eastmoney.com"
LIST_URL_TEMPLATE = BASE_URL + "/default,99_{page}.html"
NEXT_PAGE_MARKER = ">下一页</a>"
REQUEST_TIMEOUT = 10  # seconds; without a timeout one stalled request hangs the run
MAX_XLS_ROWS = 65536  # hard row limit of the .xls format written by xlwt

# The href and the title are captured from the SAME anchor so they can never
# get out of step. `[^"]*` keeps each capture inside its own attribute value.
# NOTE(review): assumes the attributes appear in the order
# data-posttype / href / title / class="note", as the original two patterns
# implied - confirm against the live markup.
POST_LINK_RE = re.compile(
    r'data-posttype="[^"]*" href="([^"]*)" title="([^"]*)" class="note"'
)
# Marks the start of the JSON object embedded in a post page's inline script.
POST_ARTICLE_RE = re.compile(r"var post_article\s*=\s*")


def parse_post_list(html):
    """Return a list of ``(title, href)`` pairs found in a listing page."""
    return [(title, href) for href, title in POST_LINK_RE.findall(html)]


def parse_post_abstract(html):
    """Return the ``post.post_abstract`` value embedded in a post page.

    Returns ``None`` when the page has no ``post_article`` object, when that
    object is not valid JSON, or when it lacks the expected keys.
    """
    match = POST_ARTICLE_RE.search(html)
    if match is None:
        return None
    try:
        # raw_decode stops at the end of the JSON value, so text such as "};"
        # inside a string cannot truncate the object.
        article, _ = json.JSONDecoder().raw_decode(html, match.end())
    except json.JSONDecodeError:
        return None
    post = article.get("post") if isinstance(article, dict) else None
    if not isinstance(post, dict) or "post_abstract" not in post:
        return None
    abstract = post["post_abstract"]
    return "" if abstract is None else abstract


def has_next_page(html):
    """Return True when the listing page contains a "next page" link."""
    return NEXT_PAGE_MARKER in html


def build_post_url(href):
    """Resolve a listing href (relative, protocol-relative or absolute)."""
    from urllib.parse import urljoin

    return urljoin(BASE_URL, href)


def fetch_html(address):
    """Download *address* and return its text, or ``None`` on any HTTP error."""
    try:
        response = requests.get(address, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as err:
        print(f"skipping {address}: {err}")
        return None
    return response.text


def main():
    """Walk the listing pages and write one row per post that has an abstract."""
    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet("d1")

    row = 0
    page = 1
    while row < MAX_XLS_ROWS:
        listing = fetch_html(LIST_URL_TEMPLATE.format(page=page))
        if listing is None:
            break

        for title, href in parse_post_list(listing):
            if row >= MAX_XLS_ROWS:
                break
            detail = fetch_html(build_post_url(href))
            if detail is None:
                continue
            abstract = parse_post_abstract(detail)
            if abstract is None:
                continue
            sheet.write(row, 0, title)
            sheet.write(row, 1, abstract)
            row += 1

        # Save after every page so an interrupted run keeps what it collected.
        workbook.save(OUTPUT_PATH)

        if not has_next_page(listing):
            break
        page += 1


if __name__ == "__main__":
    main()