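# Written by following a web-crawler tutorial; the original code differed in places, and the parts that did not work have been fixed.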
# -*- coding: utf-8 -*-
"""
Created on Sat Dec 17 22:22:08 2016
统计发帖数
@author: cc
"""
import csv
import re
import urllib
import urllib.request
from collections import Counter

from bs4 import BeautifulSoup
# Scheme and host prepended to the relative thread links found on a list page.
TIEBA_ROOT = "http://tieba.baidu.com"

# Thread-list URL; the page offset is appended to it.  The forum keyword is
# kept in its already percent-encoded form: raw Chinese characters in the URL
# make the request fail, so the encoded text was copied from the address
# shown when the page is opened in a browser.
THREAD_LIST_URL = 'http://tieba.baidu.com/f?kw=%B1%BE%D6%CA%B0%F4%E6%BB&ie=utf-8&pn='

# The `pn` query value advances by this much per page (presumably the number
# of threads shown on one list page -- confirm against the site).
PAGE_STEP = 50

# CSS-class patterns, compiled once instead of on every page.
TITLE_CLASS = re.compile('j_th_tit')
REPLY_COUNT_CLASS = re.compile("threadlist_rep_num.*")
AUTHOR_CLASS = re.compile('tb_icon_author$')


def build_rows(links, titles, reply_counts, authors):
    """Combine the parallel per-thread columns into CSV rows.

    Args:
        links: relative thread hrefs; each is prefixed with TIEBA_ROOT.
        titles: thread titles.
        reply_counts: reply-count texts.
        authors: thread author names.

    Returns:
        A list of ``(absolute_link, title, reply_count, author)`` tuples.
        The columns are paired positionally, and the result is as long as
        the shortest column, so a page where one column has fewer entries no
        longer raises IndexError.
    """
    return [
        (TIEBA_ROOT + href, title, replies, author)
        for href, title, replies, author
        in zip(links, titles, reply_counts, authors)
    ]


def fetch_page(page_index):
    """Download thread-list page number `page_index` and return its raw bytes."""
    req = urllib.request.Request(THREAD_LIST_URL + str(page_index * PAGE_STEP))
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(req) as response:
        return response.read()


def parse_threads(the_page):
    """Extract one row per thread from the HTML of a thread-list page.

    Returns the rows produced by `build_rows`.  A warning is printed when
    the scraped columns differ in length, because the positional pairing may
    then be misaligned.
    """
    soup = BeautifulSoup(the_page, 'lxml')

    links = []
    titles = []
    for tag in soup.find_all(name='a', attrs={'class': TITLE_CLASS}):
        links.append(tag['href'])
        titles.append(tag.string)

    reply_counts = [
        tag.string
        for tag in soup.find_all(name='span', attrs={'class': REPLY_COUNT_CLASS})
    ]
    # .get() keeps the column aligned if an author span has no title attribute.
    authors = [
        tag.get('title', '')
        for tag in soup.find_all(name='span', attrs={'class': AUTHOR_CLASS})
    ]

    if not (len(links) == len(reply_counts) == len(authors)):
        print("warning: column lengths differ ({0}/{1}/{2}); "
              "extra entries are dropped".format(
                  len(links), len(reply_counts), len(authors)))

    return build_rows(links, titles, reply_counts, authors)


def append_rows(rows, path='tiezi.csv'):
    """Append `rows` to the CSV file at `path`."""
    # 'a' appends to the file.  It must be opened in text mode: binary modes
    # such as 'wb' make csv.writer fail with a bytes/str error.
    with open(path, 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile, dialect='excel')
        writer.writerows(rows)


def scrape(page_total=30, path='tiezi.csv'):
    """Scrape `page_total` list pages, appending each page's rows to `path`."""
    for k in range(page_total):
        append_rows(parse_threads(fetch_page(k)), path)
        print(str(k)+"finished")


if __name__ == "__main__":
    scrape()
def top_authors(rows, limit=5):
    """Return the most frequent values found in the fourth column of `rows`.

    Args:
        rows: an iterable of row sequences, such as a csv.reader.
        limit: how many of the most common values to return.

    Returns:
        A list of ``(value, count)`` pairs, most frequent first, as produced
        by ``Counter.most_common``.  Rows with fewer than four fields (for
        example blank lines) are skipped instead of raising IndexError.
    """
    authors = [row[3] for row in rows if len(row) > 3]
    return Counter(authors).most_common(limit)  # tally, then keep the top `limit`


def report_top_authors(path='tiezi.csv', limit=5):
    """Print and return the `limit` most frequent authors in the CSV at `path`."""
    # Read-only access is enough, and the context manager closes the file.
    with open(path, 'r', newline='', encoding='utf-8') as csvfile:
        ranking = top_authors(csv.reader(csvfile), limit)
    print(ranking)
    return ranking


if __name__ == "__main__":
    report_top_authors()