# 话不多说直接上代码 (without further ado, straight to the code):
from wsgiref import headers
import requests
from bs4 import BeautifulSoup
import re
# 证券日报 (Securities Daily): scrape link, title and publish time of each
# search result for "贵州茅台" (Kweichow Moutai) using regular expressions.
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.43"
}
request_url = "http://search.zqrb.cn/search.php?src=all&q=%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&f=_all&s=newsdate_DESC"
# timeout so a stalled server cannot hang the script forever
page_content = requests.get(url=request_url, headers=header, timeout=10).text
# One pattern per field; each is matched independently against the page.
page_href = ' <a href="(.*?)" target="_blank"><h4>.*? </h4></a>'
page_title = '<a href=".*?" target="_blank"><h4>(.*?)</h4></a>'
page_time = '<span><strong>时间:</strong>(.*?)</span>'
page_href = re.findall(page_href, page_content)
page_title = re.findall(page_title, page_content)
page_time = re.findall(page_time, page_content)
# zip() stops at the shortest list, so a pattern that matches fewer times
# than the others cannot trigger the IndexError that parallel
# range(len(page_title)) indexing would.
for href, title, pub_time in zip(page_href, page_title, page_time):
    # 数据清洗: strip any residual HTML tags left inside the title text
    title = re.sub('<.*?>', '', title)
    print("链接为:" + href + " 标题为:" + title + " 时间为" + pub_time + "\n")
# 证券日报: the same search page, parsed with BeautifulSoup instead of
# regexes — fetch, build the DOM, then walk the result-list anchors.
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.43"
}
request_url = "http://search.zqrb.cn/search.php?src=all&q=%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&f=_all&s=newsdate_DESC"
# timeout so a stalled server cannot hang the script forever
page_content = requests.get(url=request_url, headers=header, timeout=10).text
soup = BeautifulSoup(page_content, "html.parser")
# Each <a> under a result's <dt> carries the article URL and its title text;
# iterate the tags directly instead of indexing with range(len(...)).
for link in soup.select(".result-list dt a"):
    print("网址为:" + link['href'] + " 内容为:" + link.text)
# 中证网 (China Securities Journal): scrape link, title and time of each
# search result for "贵州茅台" using regular expressions.
header = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.80 Safari/537.36 Edg/98.0.1108.43"
}
request_url = "http://search.cs.com.cn/search?searchword=%E8%B4%B5%E5%B7%9E%E8%8C%85%E5%8F%B0&channelid=215308"
# timeout so a stalled server cannot hang the script forever
page_content = requests.get(url=request_url, headers=header, timeout=10).text
patern_href = '<a style="font-size: 16px;color: #0066ff;line-height: 20px" href="(.*?)" target="_blank">.*?</a>'
patern_title = '<a style="font-size: 16px;color: #0066ff;line-height: 20px" href=".*?" target="_blank">(.*?)</a>'
# NOTE(review): this time pattern is very loose (any text before "</td>",
# matched with re.S) — verify it still lines up with the other two lists.
patern_time = " .*? (.*?)</td>"
page_href = re.findall(patern_href, page_content)
page_title = re.findall(patern_title, page_content)
page_time = re.findall(patern_time, page_content, re.S)
# The original indexed page_title/page_time by len(page_href); zip() keeps
# the three lists in lockstep and cannot raise IndexError when one pattern
# matches fewer times than the others.
for href, title, pub_time in zip(page_href, page_title, page_time):
    print("链接为:" + href + " " + "题目为:" + title + " 时间:" + pub_time)