python统计贴吧发帖数(找水王)

参考一篇爬虫教程写的;原文代码在我这里有些地方运行不通,下面是修正了这些问题之后的版本。

# -*- coding: utf-8 -*-
"""
Created on Sat Dec 17 22:22:08 2016
统计发帖数
@author: cc
"""
import csv
import re
import urllib
import urllib.request
from collections import Counter

from bs4 import BeautifulSoup

# Scrape 30 list pages of one Tieba forum and append one CSV row per thread
# (link, title, reply count, author) to tiezi.csv.
#
# The forum keyword is kept percent-encoded in the URL: a raw Chinese keyword
# in the URL makes the request fail, so the encoded form was copied from the
# address the browser shows after opening the page.
# NOTE(review): the kw bytes (%B1%BE...) are not valid UTF-8 percent-encoding
# although ie=utf-8 is passed — confirm the server resolves the intended forum.
LIST_URL = 'http://tieba.baidu.com/f?kw=%B1%BE%D6%CA%B0%F4%E6%BB&ie=utf-8&pn='

# Class-name patterns, compiled once instead of on every page.
TITLE_CLASS = re.compile('j_th_tit')
REPLY_NUM_CLASS = re.compile("threadlist_rep_num.*")
AUTHOR_CLASS = re.compile('tb_icon_author$')

for k in range(0, 30):
    # The pn query parameter advances by 50 per iteration
    # (presumably 50 threads per list page — confirm).
    req = urllib.request.Request(LIST_URL + str(k * 50))
    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        the_page = response.read()
    soup = BeautifulSoup(the_page, 'lxml')

    title_tags = soup.find_all(name='a', attrs={'class': TITLE_CLASS})
    # href is appended directly to the host, so the scheme separator "//"
    # must be present for the stored link to be a usable URL.
    links = ["http://tieba.baidu.com" + tag['href'] for tag in title_tags]
    titles = [tag.string for tag in title_tags]
    reply_counts = [
        tag.string
        for tag in soup.find_all(name='span', attrs={"class": REPLY_NUM_CLASS})
    ]
    authors = [
        tag['title']
        for tag in soup.find_all(name='span', attrs={'class': AUTHOR_CLASS})
    ]

    # zip stops at the shortest column, so a page where one column has fewer
    # matches no longer aborts the whole run with an IndexError.
    if not (len(links) == len(titles) == len(reply_counts) == len(authors)):
        print(str(k) + "column lengths differ; rows truncated to the shortest column")
    rows = list(zip(links, titles, reply_counts, authors))

    # 'a' appends to the file so every page adds to the same CSV; the file
    # must be opened in text mode (binary modes such as 'wb' make csv fail).
    with open('tiezi.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerows(rows)
    print(str(k) + "finished")
# Tally how often each author appears in tiezi.csv and print the five most
# frequent ones as (author, count) pairs.
# The file is only read, so it is opened read-only and closed by the with block.
with open('tiezi.csv', 'r', newline='', encoding='utf-8') as csvfile:
    # The author is the fourth column (index 3); rows too short to have one
    # (e.g. blank lines) are skipped instead of raising IndexError.
    author_counts = Counter(
        line[3] for line in csv.reader(csvfile) if len(line) > 3
    )
print(author_counts.most_common(5))
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值