Python crawler + jieba word-frequency analysis

This post shows how to scrape announcement data from the CNINFO website with Python's requests library, including the cookies and headers the request needs, and how to run a word-frequency analysis on the downloaded PDF files afterwards.


Crawler: requests

import requests


def get_href(city):
    # NOTE: `city` is currently unused; the query below filters by
    # industry and date range only. `requests` is imported at module level.

    # Session cookies captured from the browser (e.g. via copy-as-cURL);
    # they are session-specific and expire, so refresh them before re-running.
    cookies = {
        'JSESSIONID': '506C71095EA2B50B816F396D73C4DBAA',
        'SF_cookie_4': '17470996',
        'insert_cookie': '45380249',
        'routeId': '.uc1',
        'SID': 'f3fa0ca8-0d56-4427-876c-7698caf92ce8',
        '_sp_id.2141': '79f5e3d8-a804-449c-8840-8f25140e79fe.1710642179.1.1710643653.1710642179.d82be77e-e408-46d9-9ae2-65feacfb8944',
    }

    headers = {
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        # 'Cookie': 'JSESSIONID=506C71095EA2B50B816F396D73C4DBAA; SF_cookie_4=17470996; insert_cookie=45380249; routeId=.uc1; SID=f3fa0ca8-0d56-4427-876c-7698caf92ce8; _sp_id.2141=79f5e3d8-a804-449c-8840-8f25140e79fe.1710642179.1.1710643653.1710642179.d82be77e-e408-46d9-9ae2-65feacfb8944',
        'Origin': 'http://www.cninfo.com.cn',
        'Pragma': 'no-cache',
        'Referer': 'http://www.cninfo.com.cn/new/commonUrl/pageOfSearch?url=disclosure/list/search&lastPage=index',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }

    data = {
        'pageNum': '1',        # only the first result page is requested here
        'pageSize': '30',
        'column': 'szse',
        'tabName': 'fulltext',
        'plate': '',
        'stock': '',
        'searchkey': '',
        'secid': '',
        'category': 'category_ndbg_szsh',  # annual reports (年度报告)
        # industry filter: power/heat/gas/water production and supply; mining
        'trade': '电力、热力、燃气及水生产和供应业;采矿业',
        'seDate': '2023-09-17~2024-03-17',
        'sortName': '',
        'sortType': '',
        'isHLtitle': 'true',
    }

    # The endpoint is plain HTTP here, so verify=False is effectively a
    # no-op; it only matters if the https:// variant of the site is used.
    response = requests.post(
        'http://www.cninfo.com.cn/new/hisAnnouncement/query',
        cookies=cookies,
        headers=headers,
        data=data,
        verify=False,
    )

    result = response.json()['announcements']
    if result is not None:
        # Download each announcement PDF and save it as 1.pdf, 2.pdf, ...
        for flag, item in enumerate(result, start=1):
            download_url = (
                'http://www.cninfo.com.cn/new/announcement/download'
                f'?bulletinId={item["announcementId"]}&announceTime={item["announcementTime"]}'
            )
            print(download_url)
            download_resp = requests.get(url=download_url, headers=headers)
            print(download_resp.status_code)

            with open(f'{flag}.pdf', 'wb') as f:
                f.write(download_resp.content)



# PDF parsing is implemented in the second script below (read_pdf_text),
# so no stub is needed here.

if __name__ == '__main__':
    # get_href() currently ignores its argument: the query filters by
    # industry and date range only, so a single call is enough. Calling it
    # once per city would just re-download and overwrite the same PDFs.
    city1 = ['厦门', '郑州', '济南', '宁波', '贵阳', '沈阳', '包头', '银川', '南昌']
    get_href(city1[0])
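The query above fetches only the first result page (pageNum=1, 30 items per page). Below is a minimal pagination sketch, not part of the original script, assuming the endpoint keeps accepting larger pageNum values and returns announcements = null past the last page (the same None case the code above already checks for); headers and data are the dicts defined inside get_href:

def fetch_all_announcements(headers, data, max_pages=20):
    """Walk the result pages by bumping pageNum until nothing comes back."""
    items = []
    for page in range(1, max_pages + 1):
        page_data = dict(data, pageNum=str(page))  # override pageNum per request
        resp = requests.post(
            'http://www.cninfo.com.cn/new/hisAnnouncement/query',
            headers=headers, data=page_data, verify=False,
        )
        page_items = resp.json().get('announcements')
        if not page_items:  # None or empty list: assume we are past the last page
            break
        items.extend(page_items)
    return items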

jieba word-frequency analysis: PyPDF2 + jieba

import PyPDF2

from jieba import analyse


def read_pdf_text(filename):
    print(filename)
    with open(filename, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        res = []
        # Extract the text of every page (extract_text() returns an empty
        # string for image-only/scanned pages, which would need OCR instead)
        for page in reader.pages:
            res.append(page.extract_text())
        return res




if __name__ == '__main__':
    text = []
    for i in range(1, 10):  # 1.pdf ... 9.pdf saved by the crawler above
        text.append(''.join(read_pdf_text(f'{i}.pdf')))
    analyse.set_stop_words(r'./stopwordlist.txt')
    # Join the documents into a single string for jieba; calling str() on
    # the list would leak brackets, quotes and escapes into the tokens.
    text = ''.join(text)

    # TF-IDF keyword extraction: top 50 terms with weights, no POS filter
    keywords = analyse.extract_tags(text, topK=50, withWeight=True, allowPOS=())
    for keyword, weight in keywords:
        print(keyword, weight)
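Note that extract_tags ranks terms by TF-IDF weight rather than raw counts. For plain word-frequency statistics, here is a minimal sketch using jieba.lcut and collections.Counter, reusing stopwordlist.txt from above; dropping single-character tokens is an extra assumption, not something the original script does:

import jieba
from collections import Counter


def word_freq(text, stopword_path='./stopwordlist.txt', topn=50):
    """Raw frequency count over jieba tokens, minus stopwords."""
    with open(stopword_path, encoding='utf-8') as f:
        stopwords = {line.strip() for line in f}
    words = [w for w in jieba.lcut(text)
             if len(w) > 1 and w not in stopwords]
    return Counter(words).most_common(topn)


# Example usage with the `text` string built above:
# for word, count in word_freq(text):
#     print(word, count)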
