Qiushibaike Crawler User Statistics

A while back I crawled Qiushibaike (糗事百科), and some of the data turned out to be quite interesting:

  1. Job
    (chart: job distribution)
    "Brick mover" (搬砖汉) is presumably self-deprecating, but the larger groups, craftsmen (手艺汪), students (学生汪) and the stay-at-home crowd (家里蹲), do seem to share one trait: plenty of free time.
  2. Hometown
    (chart: hometown distribution)
    By and large, the provinces with the largest populations also have the most users, e.g. Henan, Shandong and Sichuan.
  3. Marital status
    (chart: marital-status distribution)
    Singles are presumably the majority ☺
  4. Horoscope
    (chart: horoscope distribution)
  5. Registration time
    (chart: registration-time distribution)
    Many users registered around 2004 and 2005; it seems the site was at its peak back then and is less popular now.
  6. In short: a crowd of singles with plenty of free time, reading jokes on the site.

Here is the program:
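A note on setup: the script depends on pymongo, jieba and matplotlib, and expects two local files, the CJK font wqy-microhei.ttc and a stopword list stopwords.txt. Assuming a standard Python 3 environment, the packages install roughly like this:

pip install pymongo jieba matplotlib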

# -*- coding: utf-8 -*-
import pymongo as mo
import jieba
import re
import matplotlib.pyplot as plt
import matplotlib as mpl
import os

# CJK font for the chart labels; the .ttc file is expected in the working directory
zhfont = mpl.font_manager.FontProperties(fname='wqy-microhei.ttc')

# Remove stopwords from a list of (word, count) pairs
def del_stopwords(indata):
    filename = 'stopwords.txt'  # stopword dictionary, one word per line
    with open(filename, 'r', encoding='utf-8') as f:
        file_list = f.read()
    stopwords = set(file_list.split('\n'))
    outlwords = []
    for d in indata:
        if d[0] not in stopwords:
            outlwords.append(d)
    return outlwords
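# stopwords.txt is assumed to hold one stopword per line (that is all the
# split('\n') above relies on). A hypothetical call, with invented data:
#   del_stopwords([('的', 120), ('哈哈', 80)])  ->  [('哈哈', 80)], if '的' is listed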


# Tokenize qs.txt with jieba and save the word frequencies to result.txt
def fenci():
    filename = 'qs.txt'
    with open(filename, 'r', encoding='utf-8') as f:
        file_list = f.read()
    seg_list = jieba.cut(file_list, cut_all=True)
    tf = {}
    for seg in seg_list:
        seg = ''.join(seg.split())  # strip all whitespace from the token
        if seg != '':
            if seg in tf:
                tf[seg] += 1
            else:
                tf[seg] = 1
    bb = sorted(tf.items(), key=lambda item: item[1], reverse=True)
    outdata = del_stopwords(bb)
    with open("result.txt", "w", encoding='utf-8') as f:
        for item in outdata:  # item is a (word, count) pair
            f.write(item[0] + "  " + str(item[1]) + "\n")


# Bulk-delete keys from a dict; takes the dict and a set of keys to delete
def deldic(indic, delset):
    for i in delset:
        indic.pop(i, None)  # ignore keys that are absent
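# e.g. deldic({'a': 1, 'b': 2}, {'a', 'c'}) leaves indic == {'b': 2};
# keys that are absent are silently ignored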


# Increment the counter for newkey in indic, creating the key if absent
def coun_dic(newkey, indic=None):
    if indic is None:  # avoid a shared mutable default argument
        indic = {}
    if newkey in indic:
        indic[newkey] += 1
    else:
        indic[newkey] = 1
    return indic
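# e.g. coun_dic('学生汪', {}) returns {'学生汪': 1}; passing the same dict
# in again returns {'学生汪': 2}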


# Increment one bin of a [bin_index, count] histogram, clamping the index
# into the valid range
def coun_list(indata, datalist):
    lengthdata = len(datalist)
    if indata < 0:
        indata = 0
    elif indata > lengthdata - 1:
        indata = lengthdata - 1
    datalist[indata][1] = datalist[indata][1] + 1
    return datalist
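# out-of-range inputs are clamped into the first or last bin, e.g. with
# 2500 bins an account age of 9999 days is counted in bin 2499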


# Convert a dict into parallel key and value lists; order does not matter
def dic2list(indic):
    keys = list(indic.keys())
    values = [indic[k] for k in keys]
    return keys, values


def plotdic(indic):
    a, b = dic2list(indic)
    if '' in a:
        a[a.index('')] = '未填写'  # label empty values as "not filled in"
    plt.bar(range(len(indic)), b)  # positional args; the old left= keyword is gone in modern matplotlib
    plt.xticks(range(len(indic)), a, fontproperties=zhfont)
    plt.show()


def gendata(db, people_att):
    work_type = {}      # job
    gender = {}         # gender (never filled below; kept from the original)
    constellation = {}  # horoscope
    marriage = {}       # marital status
    hometown = {}       # hometown
    peopname = {}       # user names
    # account-age histogram (in days) for all users
    totaltime_count = [[x, 0] for x in range(2500)]
    # account-age histogram for users with more than 10 jokes
    totaltime_count1 = [[x, 0] for x in range(2500)]
    # joke-count histogram for all users (tallied but not plotted below)
    qiushi_count = [[x, 0] for x in range(3000)]
    kk = 1  # record counter
    joke = set()
    for d in db.find():
        try:
            totaltime_count = coun_list(int(re.sub(r"\D", "", d['total_time'])), totaltime_count)
            # a document holds 15 profile fields, so len(d) > 25 means more than 10 jokes
            if len(d) > 25:
                totaltime_count1 = coun_list(int(re.sub(r"\D", "", d['total_time'])), totaltime_count1)
            peopname = coun_dic(d['name'], peopname)
            # every key beyond the 15 profile fields is a joke entry
            qiushi_count = coun_list(len(d) - 15, qiushi_count)
            work_type = coun_dic(d['job'], work_type)
            constellation = coun_dic(d['horoscope'], constellation)
            marriage = coun_dic(d['marri'], marriage)
            if d['hometown'] == '':
                hometown = coun_dic('', hometown)
            elif d['hometown'] == '未知':  # the literal "unknown" as scraped
                hometown = coun_dic('未知', hometown)
            elif len(d['hometown'].split(' · ')) == 1:
                hometown = coun_dic(d['hometown'], hometown)
            else:
                # keep only the province part of "province · city"
                hometown = coun_dic(d['hometown'].split(' · ')[0], hometown)
            # collect the joke bodies
            deldic(d, people_att)  # strip the profile fields; only joke entries remain
            for i in d:
                if len(d[i]['body']) > 10:
                    joke.add(d[i]['body'])
        except Exception as e:
            print(str(e))
            print(d)
            os._exit(1)  # abort on the first malformed document
        kk = kk + 1
    peopatt = {'work_type': work_type, 'marriage': marriage, 'constellation': constellation, 'hometown': hometown,
               'totaltime_count': totaltime_count, 'totaltime_count1': totaltime_count1, 'peopname': peopname}
    return joke, peopatt


if __name__ == '__main__':
    # profile fields present in every user document (everything else is a joke entry)
    people_att = {'_id', 'name', 'pic', 'funs_num', 'atten_num', 'qiushi_num',
                  'comment_num', 'face_num', 'choice_num', 'marri',
                  'horoscope', 'job', 'hometown', 'total_time', 'flag'}
    client = mo.MongoClient('localhost', 27017)
    databases_name = 'qsbk2'
    tablename = 'qsbk2'
    db = client[databases_name][tablename]
    joke, peopatt = gendata(db, people_att)
    # plot: job
    plotdic(peopatt['work_type'])
    # plot: marriage
    plotdic(peopatt['marriage'])
    # plot: horoscope
    plotdic(peopatt['constellation'])
    # plot: hometown
    plotdic(peopatt['hometown'])
    # plot: account age, with the x axis mapped onto registration years
    plt.plot([x[1] for x in peopatt['totaltime_count']], label='registration time')
    plt.plot([x[1] for x in peopatt['totaltime_count1']], label='registration time, joke count > 25')
    xtic = [x * 365 + 285 for x in range(7)]  # data collected through early October
    xtic.insert(0, 1)
    plt.xticks(xtic, ['2017.10', 2017, 2016, 2015, 2014, 2013, 2012, 2011], fontproperties=zhfont)
    plt.legend(prop=zhfont)
    plt.show()

    print('total number of jokes', len(joke))
    client.close()

    print('finish')
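For reference, gendata assumes each MongoDB document carries the fifteen profile fields listed in people_att, plus one extra key per scraped joke whose value holds a 'body' string; that is why len(d) - 15 approximates a user's joke count. A hypothetical document (every value here is invented for illustration) might look like:

{
    '_id': '...', 'name': '...', 'pic': '...', 'flag': 1,
    'funs_num': '10', 'atten_num': '5', 'qiushi_num': '30',
    'comment_num': '100', 'face_num': '200', 'choice_num': '3',
    'marri': '单身', 'horoscope': '天蝎座', 'job': '学生汪',
    'hometown': '山东 · 青岛',
    'total_time': '1024天',       # only the digits are used (account age in days)
    '25988103': {'body': '...'}   # one such entry per joke
}

Running the script also requires a local MongoDB on port 27017 with the qsbk2.qsbk2 collection already populated by the crawler.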