python统计贴吧发帖数（找水王）

最新推荐文章于 2022-03-31 20:56:08 发布

灿cc

最新推荐文章于 2022-03-31 20:56:08 发布

阅读量719

点赞数 1

分类专栏： python 文章标签： python 爬虫

本文链接：https://blog.csdn.net/sinat_34233802/article/details/53792434

版权

python 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

这段代码是参照一篇爬虫教程写的，与原文代码略有不同：原文中运行有问题的地方已经做了修改。

# -*- coding: utf-8 -*-
"""
Created on Sat Dec 17 22:22:08 2016
统计发帖数
@author: cc
"""
import csv
import re
import urllib
import urllib.request
from collections import Counter

from bs4 import BeautifulSoup

# Listing URL of the forum. Non-ASCII characters in a URL make the request
# fail, so the forum name in ``kw`` is already percent-encoded (open the forum
# in a browser and copy the escaped form from the address bar).
# NOTE(review): the escaped bytes are not valid UTF-8 although ``ie=utf-8`` is
# passed -- confirm the server still resolves the intended forum.
LIST_URL = 'http://tieba.baidu.com/f?kw=%B1%BE%D6%CA%B0%F4%E6%BB&ie=utf-8&pn='
THREADS_PER_PAGE = 50   # the ``pn`` offset advances by this much per page
PAGE_COUNT = 30         # number of listing pages to crawl
CSV_PATH = 'tiezi.csv'
AUTHOR_COLUMN = 3       # rows are (link, title, reply count, author)

# CSS-class patterns, compiled once instead of on every page.
TITLE_CLASS = re.compile('j_th_tit')
REPLY_CLASS = re.compile("threadlist_rep_num.*")
# The trailing ``$`` keeps this from also matching ``tb_icon_author_rely``.
AUTHOR_CLASS = re.compile('tb_icon_author$')


def fetch_page(page_index):
    """Download listing page number *page_index* (0-based) and return its bytes."""
    req = urllib.request.Request(LIST_URL + str(page_index * THREADS_PER_PAGE))
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(req) as response:
        return response.read()


def parse_threads(html):
    """Extract one ``(link, title, reply_count, author)`` tuple per thread.

    The three tag kinds are collected independently and paired by position.
    ``zip`` stops at the shortest collection, so a page that yields fewer
    reply or author spans than titles produces fewer rows instead of raising
    ``IndexError``.
    """
    soup = BeautifulSoup(html, 'lxml')
    titles = soup.find_all(name='a', attrs={'class': TITLE_CLASS})
    replies = soup.find_all(name='span', attrs={'class': REPLY_CLASS})
    authors = soup.find_all(name='span', attrs={'class': AUTHOR_CLASS})
    return [
        ("http://tieba.baidu.com" + title['href'],
         title.string,
         reply.string,
         author['title'])
        for title, reply, author in zip(titles, replies, authors)
    ]


def append_rows(rows, path=CSV_PATH):
    """Append *rows* to the CSV file at *path*, creating it if necessary.

    The file is opened in text append mode: the csv module writes ``str``, so
    binary modes such as ``'wb'`` fail. Because rows are appended, results
    from earlier runs stay in the file.
    """
    with open(path, 'a', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile, dialect='excel').writerows(rows)


def top_authors(path=CSV_PATH, limit=5):
    """Return the *limit* most frequent authors in *path* as ``(author, count)`` pairs.

    Rows too short to contain an author column are skipped.
    """
    with open(path, 'r', newline='', encoding='utf-8') as csvfile:
        authors = [
            row[AUTHOR_COLUMN]
            for row in csv.reader(csvfile)
            if len(row) > AUTHOR_COLUMN
        ]
    return Counter(authors).most_common(limit)


def main():
    """Crawl every listing page into the CSV, then print the five top posters."""
    for k in range(PAGE_COUNT):
        append_rows(parse_threads(fetch_page(k)))
        print(str(k) + "finished")
    print(top_authors())


if __name__ == "__main__":
    main()