糗事百科爬虫用户统计

最新推荐文章于 2019-02-19 21:38:30 发布

ljj123zz

最新推荐文章于 2019-02-19 21:38:30 发布

阅读量341

点赞数

文章标签：糗事百科爬虫数据统计

本文链接：https://blog.csdn.net/ljj123zz/article/details/78804926

版权

糗事百科爬虫用户统计

以前爬了糗事百科，一些数据还是挺有意思的:

工作

搬砖汉应该是调侃自己的，不过人数比较多的手艺汪、学生汪、家里蹲应该有一些共同点，那就是时间比较自由。
故乡

基本上人口基数大的地方用户也多，比如河南、山东、四川等
婚姻状况

估计单身狗会比较多☺
星座
注册时间

很多用户都是在04、05年左右注册的，看来那时候风头挺火，现在似乎不如以前了
总结下来就是：一堆时间自由的单身狗在这上面看笑话。

下面是程序：

# -*- coding: utf-8 -*-
import pymongo as mo
import jieba
import re
import matplotlib.pyplot as plt
import matplotlib as mpl
import os
# Font properties passed to matplotlib wherever Chinese text is drawn (tick
# labels, legend). 'wqy-microhei.ttc' is a relative path, so the font file
# must be present in the working directory.
zhfont = mpl.font_manager.FontProperties(fname='wqy-microhei.ttc')

def del_stopwords(indata):
    """Return the entries of *indata* whose first element is not a stop word.

    Stop words are read from ``stopwords.txt`` in the current working
    directory, one word per line, decoded as UTF-8.

    Args:
        indata: Iterable of indexable items (for example ``(word, count)``
            pairs). Only ``item[0]`` is compared against the stop words.

    Returns:
        A new list holding the surviving items in their original order.

    Raises:
        OSError: If ``stopwords.txt`` cannot be opened for reading.
    """
    filename = 'stopwords.txt'
    # Read-only mode is sufficient, and the context manager closes the handle
    # even when reading fails.
    with open(filename, 'r', encoding='utf-8') as f:
        # A set gives constant-time membership tests for the filter below.
        stopwords = set(f.read().split('\n'))
    return [d for d in indata if d[0] not in stopwords]


def fenci():
    """Segment ``qs.txt`` with jieba and save word frequencies to ``result.txt``.

    The text is cut in jieba's full mode, whitespace is stripped from every
    segment, empty segments are dropped, and the remaining words are counted.
    The counts are sorted in descending order, filtered through
    ``del_stopwords`` and written one per line as ``<word>  <count>``.

    Raises:
        OSError: If ``qs.txt`` cannot be read or ``result.txt`` cannot be
            written.
    """
    with open('qs.txt', 'r', encoding='utf-8') as f:
        text = f.read()

    tf = {}
    for seg in jieba.cut(text, cut_all=True):
        # Removing all whitespace also turns "\n" / "\n\n" segments into "",
        # so a single emptiness check is enough.
        seg = ''.join(seg.split())
        if seg:
            tf[seg] = tf.get(seg, 0) + 1

    ranked = sorted(tf.items(), key=lambda item: item[1], reverse=True)
    outdata = del_stopwords(ranked)

    with open("result.txt", "w", encoding='utf-8') as f:
        # Each entry is a (word, count) pair and must be unpacked; joining the
        # tuple itself to a string raises TypeError.
        for word, count in outdata:
            f.write(word + "  " + str(count) + "\n")


def deldic(indic, delset):
    """Delete every key listed in *delset* from *indic*, in place.

    Args:
        indic: Dictionary to modify.
        delset: Iterable of keys to remove; keys that are absent from
            *indic* are skipped silently.

    Returns:
        None. *indic* is mutated directly.
    """
    for key in delset:
        # pop() with a default ignores missing keys without a catch-all
        # handler that would also hide unrelated errors.
        indic.pop(key, None)


def coun_dic(newkey, indic=None):
    """Increment the counter stored under *newkey* in *indic*.

    Args:
        newkey: Key to count. A key that is not yet present starts at 1.
        indic: Dictionary of counters, updated in place. When omitted, a
            fresh dictionary is created for this call, so separate calls do
            not share state.

    Returns:
        The dictionary that was updated (the same object as *indic* when one
        was supplied).
    """
    # Compare against None rather than truthiness so that an empty dict passed
    # by the caller is still updated in place.
    if indic is None:
        indic = {}
    indic[newkey] = indic.get(newkey, 0) + 1
    return indic


def coun_list(indata, datalist):
    """Increment the counter of the bucket selected by *indata*.

    *datalist* is a list of ``[label, count]`` pairs. *indata* is used as the
    bucket index after being clamped to the valid range, so values below zero
    land in the first bucket and values past the end land in the last one.

    Args:
        indata: Integer bucket index.
        datalist: List of two-element mutable sequences, updated in place.

    Returns:
        The same *datalist* object.
    """
    last = len(datalist) - 1
    slot = min(max(indata, 0), last)
    datalist[slot][1] += 1
    return datalist


def dic2list(indic):
    """Split a dictionary into parallel lists of keys and values.

    Args:
        indic: Dictionary to split.

    Returns:
        A ``(keys, values)`` tuple of two lists, where ``values[i]`` is the
        value stored under ``keys[i]``. The order follows the dictionary's
        own iteration order.
    """
    keys = list(indic)
    values = [indic[key] for key in keys]
    return keys, values


def plotdic(indic):
    """Show a bar chart of the counters in *indic*.

    Each key becomes one bar labelled with the key; an empty-string key is
    displayed as '未填写'. The chart is displayed with ``plt.show()``.

    Args:
        indic: Dictionary mapping category labels to numeric counts.
    """
    a, b = dic2list(indic)
    # Relabel the empty category only if it exists; list.index('') would raise
    # ValueError for data without an empty key.
    labels = ['未填写' if key == '' else key for key in a]
    positions = range(len(indic))
    # Bar positions are passed positionally because the keyword for this
    # argument differs between matplotlib versions ('left' vs 'x').
    plt.bar(positions, b)
    plt.xticks(positions, labels, fontproperties=zhfont)
    plt.show()


def gendata(db, people_att):
    """Aggregate per-user statistics and collect joke texts.

    Args:
        db: Object whose ``find()`` yields one dict-like document per user
            (the script passes a pymongo collection).
        people_att: Iterable of profile field names. These keys are deleted
            from every document after its profile fields have been counted,
            so that the remaining entries can be scanned for joke bodies.

    Returns:
        A ``(joke, peopatt)`` tuple.

        ``joke`` is the set of distinct ``body`` strings longer than 10
        characters found in the remaining entries of each document.

        ``peopatt`` is a dict with these keys:
          * ``work_type``, ``marriage``, ``constellation``, ``hometown``,
            ``peopname``: dicts counting the values of the ``job``,
            ``marri``, ``horoscope``, ``hometown`` and ``name`` fields.
            For ``hometown`` only the part before the first ' · ' is counted.
          * ``totaltime_count``: 2500 ``[index, count]`` buckets indexed by
            the digits of ``total_time``, for all users.
          * ``totaltime_count1``: the same histogram restricted to documents
            with more than 25 fields.
          * ``qiushi_count``: 3000 ``[index, count]`` buckets indexed by
            ``len(document) - 15``.

    Raises:
        Exception: Any error raised while processing a document is re-raised
            after the error message and the offending document are printed.
    """
    work_type = {}
    constellation = {}
    marriage = {}
    hometown = {}
    peopname = {}
    # Histogram of total_time for every user.
    # NOTE(review): the unit of total_time is not visible here; the plotting
    # code treats the index as days — confirm.
    totaltime_count = [[x, 0] for x in range(2500)]
    # Same histogram, restricted to documents with more than 25 fields.
    totaltime_count1 = [[x, 0] for x in range(2500)]
    # Histogram of len(document) - 15.
    # NOTE(review): 15 is presumably the number of profile fields, making this
    # a per-user joke count — confirm against the stored schema.
    qiushi_count = [[x, 0] for x in range(3000)]
    joke = set()
    for d in db.find():
        try:
            # Keep only the digits of total_time before converting to int.
            total_time = int(re.sub(r"\D", "", d['total_time']))
            totaltime_count = coun_list(total_time, totaltime_count)
            if len(d) > 25:
                totaltime_count1 = coun_list(total_time, totaltime_count1)
            peopname = coun_dic(d['name'], peopname)
            qiushi_count = coun_list(len(d) - 15, qiushi_count)
            work_type = coun_dic(d['job'], work_type)
            constellation = coun_dic(d['horoscope'], constellation)
            marriage = coun_dic(d['marri'], marriage)
            # The first ' · '-separated component is the whole string when no
            # separator is present, which also covers '' and '未知'.
            hometown = coun_dic(d['hometown'].split(' · ')[0], hometown)
            # Drop the profile fields; what is left is scanned for jokes.
            deldic(d, people_att)
            for entry in d.values():
                body = entry['body']
                if len(body) > 10:
                    joke.add(body)
        except Exception as e:
            # Report which document failed, then propagate the original error.
            # os._exit() without a status would raise TypeError here and hide
            # the real cause.
            print(str(e))
            print(d)
            raise
    peopatt = {'work_type': work_type, 'marriage': marriage, 'constellation': constellation, 'hometown': hometown,
               'totaltime_count': totaltime_count, 'totaltime_count1': totaltime_count1, 'peopname': peopname,
               'qiushi_count': qiushi_count}
    return joke, peopatt


if __name__ == '__main__':
    # Profile fields stored on every user document; gendata removes these keys
    # before scanning a document for jokes.
    people_att = {'_id', 'name', 'pic', 'funs_num', 'atten_num', 'qiushi_num',
                  'comment_num', 'face_num', 'choice_num', 'marri',
                  'horoscope', 'job', 'hometown', 'total_time', 'flag'}
    client = mo.MongoClient('localhost', 27017)
    # Close the connection even if aggregation or plotting raises.
    try:
        databases_name = 'qsbk2'
        tablename = 'qsbk2'
        db = client[databases_name][tablename]
        joke, peopatt = gendata(db, people_att)

        # One bar chart each for job, marital status, horoscope and hometown.
        for attribute in ('work_type', 'marriage', 'constellation', 'hometown'):
            plotdic(peopatt[attribute])

        # Line chart of the two total_time histograms.
        plt.plot([x[1] for x in peopatt['totaltime_count']], label=u'注册时间', )
        # NOTE(review): gendata fills 'totaltime_count1' for documents with
        # more than 25 fields, which may not equal "more than 25 jokes" as the
        # label states — confirm the intended threshold.
        plt.plot([x[1] for x in peopatt['totaltime_count1']], label=u'段子数大于25的注册时间')
        # Tick positions: index 1, then one tick per 365 steps starting at 285
        # (data collected up to early October).
        xtic = [1] + [x * 365 + 285 for x in range(7)]
        plt.xticks(xtic, ['2017.10', 2017, 2016, 2015, 2014, 2013, 2012, 2011])
        plt.legend(prop=zhfont)
        plt.show()

        print('总共段子个数', len(joke))
    finally:
        client.close()

    print('finish')

ljj123zz

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
糗事百科爬虫用户统计

糗事百科爬虫用户统计以前爬了糗事百科，一些数据还是挺有意思的:工作搬砖汉应该是调侃自己的，不过人数比较多的手艺汪，学生汪，家里蹲应该有一些共同点，那就是时间比较自由故乡基本上人口基数大的地方用户也多，比如河南、山东、四川等婚姻状况估计单身狗会比较多☺星座注册时间很多用户都是在04、05年左右注册的，看来那时候风头挺火，现在似乎不如以前了总结下来就是一堆时间
复制链接

扫一扫