[Python] Tag statistics for the top 5,000 works on Jinjiang's original danmei points ranking

Code:
File 1 — scrape the rankings in segments and save the tag counts to txt:

import requests
from lxml import etree
from bs4 import BeautifulSoup
import json

# Build a ranking-page URL from the page number, following the URL pattern of the list pages:
def get_url_1(page):
    head = 'http://www.jjwxc.net/bookbase.php?fw0=0&fbsj=0&ycx1=1&xx2=2&mainview0=0&sd0=0&lx0=0&fg0=0&sortType=2&isfinish=0&collectiontypes=ors&searchkeywords=&page='
    url = head + str(page)
    return url

# Build a novel-page URL from the relative link scraped off the ranking page:
def get_url_2(number):
    head = 'http://www.jjwxc.net/'
    url = head + str(number)
    return url

# Fetch one ranking page and extract the link of each novel on it:
def get_info_1(url_1):
    # Spoof a browser so the IP does not get banned
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        'Host': 'www.jjwxc.net',
        'Cookie': '__gads=ID=adb1bd199ca80e42:T=1592032062:S=ALNI_MbfwuQah_VUIJ0eFciwrmcI0YVBcQ; CNZZDATA1275156903=1619785691-1592040877-null%7C1592040877; __cfduid=d5561249470bba6af47bba14f331c644e1592045059; UM_distinctid=172ad47ba1a345-0e33c371452b35-f7d123e-144000-172ad47ba1de; timeOffset_o=-395.800048828125; CNZZDATA30075907=cnzz_eid%3D149420612-1592030978-null%26ntime%3D1594286393; testcookie=yes; Hm_lvt_bc3b748c21fe5cf393d26c12b2c38d99=1592471583,1594093184,1594178729,1594290569; token=Mjk5MDY4Mjd8ZWE4Y2Q1ZTc5OTIzZjNjNDgxMmNmZjU5NDI1MGEyMzl8fHx8MTA4MDB8MXx8fOaZi%2Baxn%2BeUqOaIt3wxfG1vYmlsZQ%3D%3D; JJSESS=%7B%22clicktype%22%3A%22%22%2C%22nicknameAndsign%22%3A%222%257E%2529%2524%25E6%25B4%25BB%25E7%259D%2580%25E4%25B8%25BA%25E4%25BA%2586%25E4%25BB%2580%25E4%25B9%2588%22%7D; JJEVER=%7B%22sms_total%22%3A%220%22%2C%22fenzhan%22%3A%22noyq%22%2C%22ispayuser%22%3A%2229906827-1%22%2C%22foreverreader%22%3A%2229906827%22%2C%22user_signin_days%22%3A%2220200709_29906827_3%22%7D; Hm_lpvt_bc3b748c21fe5cf393d26c12b2c38d99=1594291386',
    }
    tries = 30
    rsp = None
    while tries > 0:
        try:
            rsp = requests.get(url_1, headers=headers)
            break
        except Exception:
            tries -= 1
    if rsp is None:  # every attempt failed
        return []
    # Set the encoding explicitly to avoid garbled Chinese text
    rsp.encoding = rsp.apparent_encoding
    data = rsp.text
    print(data)  # debug: dump the raw HTML of the ranking page
    selector = etree.HTML(data)
    # Grab the hyperlinks of every novel on this page
    url_2 = selector.xpath('//html//body//table//tbody//tr//td[2]//a/@href')
    print(url_2)
    return url_2

# Fetch one novel's page and extract its tags:
def get_info_2(url_2):
    # Spoof a browser so the IP does not get banned
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36',
        'Host': 'www.jjwxc.net',
        'Cookie': '__gads=ID=adb1bd199ca80e42:T=1592032062:S=ALNI_MbfwuQah_VUIJ0eFciwrmcI0YVBcQ; CNZZDATA1275156903=1619785691-1592040877-null%7C1594437918; __cfduid=df5911758d582c90c410614cae0204c661595644212; UM_distinctid=17383ce87ff12-0b5ff82dc4d561-b7a1334-144000-17383ce88005aa; CNZZDATA30075907=cnzz_eid%3D149420612-1592030978-null%26ntime%3D1598006596; testcookie=yes; timeOffset_o=3822.39990234375; Hm_lvt_bc3b748c21fe5cf393d26c12b2c38d99=1597378421,1597635899,1597657203,1598007103; token=Mjk5MDY4Mjd8Y2NiYzJlMTE1OTU0NzI3NGRjZjNhNGM4YTNlMTZhNmV8fHx8MTA4MDB8MXx8fOaZi%2Baxn%2BeUqOaIt3wxfGpqcmVhZGVy; JJSESS=%7B%22sidkey%22%3A%22pmZ5hDLY8uXBPKeOAGCRIgfsF1vHcMNUEQyo23xq%22%2C%22nicknameAndsign%22%3A%222%257E%2529%2524%25E6%25B4%25BB%25E7%259D%2580%25E4%25B8%25BA%25E4%25BA%2586%25E4%25BB%2580%25E4%25B9%2588%22%7D; JJEVER=%7B%22ispayuser%22%3A%2229906827-1%22%2C%22foreverreader%22%3A%2229906827%22%2C%22user_signin_days%22%3A%2220200821_29906827_2%22%2C%22sms_total%22%3A0%2C%22fenzhan%22%3A%22noyq%22%7D; Hm_lpvt_bc3b748c21fe5cf393d26c12b2c38d99=1598007308',
    }
    tries = 30
    rsp = None
    while tries > 0:
        try:
            rsp = requests.get(url_2, headers=headers)
            break
        except Exception:
            tries -= 1
    if rsp is None:  # every attempt failed
        return []
    soup = BeautifulSoup(rsp.content, 'lxml')
    # The tags are the red, non-underlined links on the novel page
    head_tot = soup.find_all('a', target="_blank", style="text-decoration:none;color: red;")
    # Skip novels whose description is being revised (no tags are shown then)
    if not head_tot:
        return []
    # Extract the text of each tag
    head_text = [i.get_text() for i in head_tot]
    return head_text

# Save a dict (or a list of key/value pairs) to a txt file as JSON:
def save_dict(data, which):
    name_str = which + '.txt'
    js = json.dumps(data, ensure_ascii=False)
    with open(name_str, 'w', encoding='utf-8') as file:
        file.write(js)

def save_it(label_list, min_page, max_page):
    # Tally the tags into a dict: {tag: count}
    word_num = {}
    for tag in label_list:
        word_num[tag] = word_num.get(tag, 0) + 1
    # Sort by count, descending
    word_num = sorted(word_num.items(), key=lambda x: x[1], reverse=True)
    print(word_num)
    # Save as a txt file
    its_label = "标签的统计" + " " + str(min_page) + "--" + str(max_page - 1)
    save_dict(word_num, its_label)
    return

def main():
    # Collect the tag list:
    label = []
    min_page = 50
    max_page = 51
    for page in range(min_page, max_page):       # one ranking page per iteration
        print("page", page)
        link_1 = get_url_1(page)
        link_2_half = get_info_1(link_1)
        label_page = []
        for num in range(len(link_2_half)):      # one novel per iteration
            print("novel:", page, ".", num)
            link_2 = get_url_2(link_2_half[num])
            label_sig = get_info_2(link_2)
            label_page += label_sig
        print(label_page)
        if not label_page:   # stop if a whole page yields nothing (likely a failed request)
            return
        label += label_page
    # Drop empty elements:
    label = [x.strip() for x in label if x.strip() != '']
    # Tally into a dict and save:
    save_it(label, min_page, max_page)
    return


if __name__ == '__main__':
    main()
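
A side note on the retry loop: all 30 attempts fire back-to-back, so a brief network hiccup burns through them in about a second while also hammering the server. Below is a minimal sketch of a gentler fetch helper that both functions could call; the fetch_with_retry name, the timeout, and the fixed delay are my own choices, not part of the original script:

import time
import requests

def fetch_with_retry(url, headers, tries=30, delay=2):
    # Retry the GET, pausing between attempts instead of retrying instantly.
    for _ in range(tries):
        try:
            rsp = requests.get(url, headers=headers, timeout=10)
            rsp.raise_for_status()  # treat HTTP errors such as 503 as failures too
            return rsp
        except requests.RequestException:
            time.sleep(delay)  # assumed fixed back-off; tune as needed
    return None  # caller should treat this as a failed page

get_info_1 and get_info_2 could then return [] whenever this helper comes back with None, just as they already do when every attempt fails.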

File 2 — merge the dictionaries from all the txt files:

import json
from collections import Counter

# Read one txt file and return its contents as a dict
def list_to_dict(name):
    with open(name, 'r', encoding='utf-8') as file:
        js = file.read()
    dic = dict(json.loads(js))
    print(dic)
    return dic

def main():
    file_name = ["标签的统计 10--11.txt", "标签的统计 12--19.txt", "标签的统计 20--29.txt", "标签的统计 30--49.txt", "标签的统计 50--50.txt"]
    dict_1 = list_to_dict("标签的统计 1--9.txt")
    for sig_name in file_name:
        dict_2 = list_to_dict(sig_name)
        X, Y = Counter(dict_1), Counter(dict_2)
        dict_1 = dict(X + Y)  # adding two Counters sums the counts of shared keys
    print(dict_1)
    return

if __name__ == '__main__':
    main()
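
For reference, once collections.Counter is in play the whole merging step can collapse into a few lines; here is a minimal sketch (the merge_and_rank name is my own, and it assumes the txt format that File 1 writes):

import json
from collections import Counter

def merge_and_rank(file_names):
    # Sum the tag counts across all txt files and rank them high-to-low.
    total = Counter()
    for name in file_names:
        with open(name, 'r', encoding='utf-8') as f:
            total += Counter(dict(json.load(f)))
    return total.most_common()

Calling merge_and_rank(["标签的统计 1--9.txt", "标签的统计 10--11.txt"]) would return a list of (tag, count) tuples sorted from most to least common.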

If you found this interesting or helpful, feel free to give it a like~
P.S. I did check the site's robots.txt, but let me say it anyway: if this infringes on anything, it will be taken down immediately.
