Crawler: requests
The script below POSTs a search query to cninfo.com.cn's historical-announcement API and downloads every matching annual-report PDF.
import requests


def get_href(city):
    cookies = {
        'JSESSIONID': '506C71095EA2B50B816F396D73C4DBAA',
        'SF_cookie_4': '17470996',
        'insert_cookie': '45380249',
        'routeId': '.uc1',
        'SID': 'f3fa0ca8-0d56-4427-876c-7698caf92ce8',
        '_sp_id.2141': '79f5e3d8-a804-449c-8840-8f25140e79fe.1710642179.1.1710643653.1710642179.d82be77e-e408-46d9-9ae2-65feacfb8944',
    }
    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # 'Cookie' is supplied via the cookies dict above
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = {
        'pageNum': '1',
        'pageSize': '30',
        'column': 'szse',
        'tabName': 'fulltext',
        'plate': '',
        'stock': '',
        'searchkey': city,  # full-text search on the city name passed in
        'secid': '',
        'category': 'category_ndbg_szsh',  # annual reports
        'trade': '电力、热力、燃气及水生产和供应业;采矿业',
        'seDate': '2023-09-17~2024-03-17',
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true',
    }
    response = requests.post(
        'http://www.cninfo.com.cn/new/hisAnnouncement/query',
        cookies=cookies,
        headers=headers,
        data=data,
        verify=False,  # skip certificate verification
    )
    result = response.json().get('announcements')
    urls = []
    flag = 1
    if result is not None:
        for i in result:
            download_url = (
                'http://www.cninfo.com.cn/new/announcement/download'
                f'?bulletinId={i["announcementId"]}&announceTime={i["announcementTime"]}'
            )
            print(download_url)
            urls.append(download_url)
            pdf_response = requests.get(url=download_url, headers=headers)
            print(pdf_response.status_code)
            # Prefix file names with the city so a later city's downloads
            # do not overwrite an earlier city's PDFs
            with open(f'{city}_{flag}.pdf', 'wb') as f:
                f.write(pdf_response.content)
            flag += 1
    return urls
def read_pdf():
    # Placeholder; PDF parsing lives in the word-frequency script below.
    pass
if __name__ == '__main__':
    city1 = ['厦门', '郑州', '济南', '宁波', '贵阳', '沈阳', '包头', '银川', '南昌']
    url_dict = dict()
    for i in city1:
        url_dict[i] = get_href(i)
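As written, the query only ever fetches pageNum 1, so each city is capped at pageSize (30) announcements. Below is a minimal pagination sketch that reuses the headers, cookies, and data dicts from get_href; the function name fetch_all_announcements is mine, and it assumes (unverified against the live API) that a page past the last result returns an empty or null 'announcements' list.

import requests

QUERY_URL = 'http://www.cninfo.com.cn/new/hisAnnouncement/query'

def fetch_all_announcements(data, headers, cookies):
    # Page through the query API until a page comes back empty.
    announcements = []
    page = 1
    while True:
        payload = dict(data, pageNum=str(page))  # shallow copy with the page number bumped
        resp = requests.post(QUERY_URL, cookies=cookies, headers=headers,
                             data=payload, verify=False)
        batch = resp.json().get('announcements')
        if not batch:  # assumption: exhausted pages return None or []
            break
        announcements.extend(batch)
        page += 1
    return announcements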
jieba word-frequency analysis: PyPDF2 + jieba
The second script extracts the text of every downloaded PDF with PyPDF2, then uses jieba's TF-IDF extractor to surface the top 50 keywords.
import glob

import PyPDF2
from jieba import analyse


def read_pdf_text(filename):
    print(filename)
    with open(filename, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        res = []
        # Walk every page in the PDF and collect its text
        for page in reader.pages:
            res.append(page.extract_text())
    return res
if __name__ == '__main__':
    text = []
    # Read every PDF the crawler saved, whatever the city or count
    for path in sorted(glob.glob('*.pdf')):
        text.append(''.join(read_pdf_text(path)))
    print(text)
    analyse.set_stop_words(r'./stopwordlist.txt')
    # Join the documents into one string; str(text) would leak list
    # brackets and quote characters into the corpus
    text = ' '.join(text)
    keywords = analyse.extract_tags(text, topK=50, withWeight=True, allowPOS=())
    for keyword, weight in keywords:
        print(keyword, weight)
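Note that extract_tags ranks keywords by TF-IDF weight, not by raw counts. For a plain word-frequency tally, here is a minimal sketch using jieba.lcut and collections.Counter; word_frequencies is my own helper name, and it assumes the same stopwordlist.txt file (one stopword per line, UTF-8).

import jieba
from collections import Counter

def word_frequencies(text, stopword_path='./stopwordlist.txt', top_n=50):
    # Raw counts per segmented word, dropping stopwords and single characters
    with open(stopword_path, encoding='utf-8') as f:
        stopwords = {line.strip() for line in f if line.strip()}
    words = [w for w in jieba.lcut(text) if len(w) > 1 and w not in stopwords]
    return Counter(words).most_common(top_n)

for word, count in word_frequencies(text):
    print(word, count)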