爬取豆瓣网信息
#定义爬取豆瓣网信息函数
import requests
from bs4 import BeautifulSoup
def parse_html(book):
    """Scrape the Douban search result page for *book* into five lists.

    Args:
        book: Search keyword; appended verbatim to the Douban search URL.
            Headings whose category part contains this keyword are skipped.

    Returns:
        ``[leixing, mingcheng, nianfen, renshu, pingfen]``:

        * ``leixing`` / ``mingcheng`` -- the text before / after ``]`` in
          every kept ``<h3>`` heading (category and title).
        * ``nianfen`` -- the last space-separated field of every
          ``span.subject-cast`` as an int, or 0 when it is not an integer.
        * ``renshu`` -- number of ratings taken from the class-less spans,
          0 for entries that have a parenthesised note but no rating count.
        * ``pingfen`` -- score as a float, 0.0 where ``renshu`` is 0.

    Raises:
        requests.RequestException: if the HTTP request fails or times out.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    # A timeout keeps a stalled connection from blocking the script forever.
    response = requests.get(
        'https://www.douban.com/search?q=' + book,
        headers=headers,
        timeout=30,
    )
    soup = BeautifulSoup(response.text, 'lxml')

    # --- category and title: headings look like "[category] title" ---
    noise_tokens = (' ', '[', '可播放', '可试读', '有电子版', '\n', '\xa0')
    leixing = []
    mingcheng = []
    for heading in soup.find_all('h3'):
        piece = heading.get_text()
        # Group and diary results are not products; drop them.
        if '小组' in piece or '日记' in piece:
            continue
        # Strip the decorations one by one, re-trimming before each pass.
        for token in noise_tokens:
            piece = piece.strip().replace(token, '')
        parts = piece.split(']')
        if book in parts[0]:
            continue
        # NOTE(review): a heading without ']' raises IndexError on parts[1],
        # exactly as before; confirm whether such headings can occur.
        leixing.append(parts[0])
        mingcheng.append(parts[1])

    # --- publication year: last field of the cast line ---
    nianfen = []
    for cast in soup.find_all('span', class_='subject-cast'):
        last_field = cast.get_text().split(' ')[-1]
        try:
            nianfen.append(int(last_field))
        except ValueError:
            # No parsable year for this entry; 0 marks it as unknown.
            nianfen.append(0)

    # --- number of ratings ---
    book_people = [span.get_text() for span in soup.find_all('span', class_='')]
    # Everything before the first span containing '[' is page chrome.
    first_result = next(
        (idx for idx, text in enumerate(book_people) if '[' in text),
        len(book_people),
    )
    renshu = []
    for piece in book_people[first_result:]:
        if '日记' in piece or '小组' in piece:
            continue
        if '人评价' in piece:
            # "(1234人评价)" -> 1234
            renshu.append(int(piece.strip('(').strip(')').strip("人评价")))
        elif "(" in piece:
            # Parenthesised note without a count (not yet released / no
            # ratings yet): record zero ratings.
            renshu.append(0)

    # --- score ---
    book_rates = [mark.get_text() for mark in soup.find_all('span', class_='rating_nums')]
    pingfen = []
    # Iterate over the titles, not over book_rates: unrated entries have no
    # rating_nums span, so book_rates is shorter and must be padded in step.
    for i in range(len(mingcheng)):
        if renshu[i] == 0:
            pingfen.append(float(0))
            book_rates.insert(i, '0')
        else:
            pingfen.append(float(book_rates[i]))

    return [leixing, mingcheng, nianfen, renshu, pingfen]
#为方便观察爬取到的结果,将数据写入csv文件,定义写入格式
import csv
import pandas as pd
# CSV output
def write_(book, content):
    """Append a header row and the scraped rows to ``<book>.csv``.

    The file is opened in ``a+`` mode, so it is created when missing and
    appended to (header included) when it already exists.

    Args:
        book: File name stem; ``'.csv'`` is appended to it.
        content: Five parallel lists (category, title, year, number of
            ratings, score).  The lists are only read, never modified.
    """
    header = ("类型", "名称", "出版年份", "评价人数", "评分")
    # newline='' lets the csv module control line endings itself.
    with open(book + '.csv', 'a+', newline='') as csv_file:
        writer = csv.writer(csv_file, dialect='excel')
        writer.writerow(header)
        # zip stops at the shortest column, so every row is complete.
        writer.writerows(
            zip(content[0], content[1], content[2], content[3], content[4])
        )
    print("write over")
# Scrape the Douban data related to the Four Great Classical Novels.
if __name__ == '__main__':
    print('begin')
    # Fetch all four result sets first, then print them in the same order.
    data1, data2, data3, data4 = (
        parse_html(title) for title in ("红楼梦", "水浒传", "三国演义", "西游记")
    )
    for scraped in (data1, data2, data3, data4):
        print(scraped)
    print('end')
写入csv中
# Scrape the Douban data related to the Four Great Classical Novels.
# NOTE(review): the notebook heading above this cell reads "write to CSV",
# yet nothing here calls write_ -- confirm whether a write step is missing.
if __name__ == '__main__':
    print('begin')
    # Fetch all four result sets first, then print them in the same order.
    data1, data2, data3, data4 = (
        parse_html(title) for title in ("红楼梦", "水浒传", "三国演义", "西游记")
    )
    for scraped in (data1, data2, data3, data4):
        print(scraped)
    print('end')
CSV文件操作
#定义函数,读取csv文件并将信息按列分离
import os
import pandas as pd
import csv
def open_data(book, directory=r"C:\\Users\\18172\\Jupyter\\"):
    """Read ``<directory>/<book>.csv`` and split its rows into columns.

    The first row is treated as the header and skipped.

    Args:
        book: File name stem; ``'.csv'`` is appended to it.
        directory: Folder containing the CSV file.  The default is the
            location that was previously hard-coded.

    Returns:
        ``[Type, Name, Year, People, Mark]`` -- one list per column, holding
        the first five cells of every data row as strings.

    Raises:
        OSError: if the file cannot be opened.
        IndexError: if a data row has fewer than five cells.
    """
    csv_path = os.path.join(directory, book + ".csv")
    types, names, years, raters, marks = [], [], [], [], []
    # newline='' is what the csv module expects for files it parses.
    with open(csv_path, newline='') as csv_file:
        reader = csv.reader(csv_file)
        next(reader, None)  # skip the header row (no-op on an empty file)
        for row in reader:
            types.append(row[0])
            names.append(row[1])
            years.append(row[2])
            raters.append(row[3])
            marks.append(row[4])
    return [types, names, years, raters, marks]
# Load the four tables, echoing the category column of each as it is read.
_tables = []
for _title in ('红楼梦', '西游记', '水浒传', '三国演义'):
    _table = open_data(_title)
    print(_table[0])
    _tables.append(_table)
content1, content2, content3, content4 = _tables
分类统计
种类饼状图
# Analyse the product categories that were scraped.
def ana_type(book, Type):
    """Count the products per category and draw the result as a pie chart.

    Args:
        book: Book title, passed on to ``drawpie`` for the chart title.
        Type: Iterable of category names, one per product.

    Returns:
        The per-category counts, ordered by first appearance in ``Type``.
    """
    type_dict = {}
    for kind in Type:
        # A category seen for the first time starts from zero.
        type_dict[kind] = type_dict.get(kind, 0) + 1
    print(type_dict)

    typename = list(type_dict.keys())
    num = list(type_dict.values())
    # No slice is pulled out of the pie.
    explode = (0,) * len(typename)
    drawpie(book, typename, num, explode)
    return num
#定义函数,绘制饼状图
import matplotlib.pyplot as plt
from pylab import mpl#字体
#设置字体
def drawpie(book, name, num, explode):
    """Show a pie chart of the category distribution for *book*.

    Args:
        book: Book title used in the chart title.
        name: Slice labels.
        num: Slice sizes.
        explode: Per-slice offsets forwarded to ``plt.pie``.
    """
    # SimHei is selected so the Chinese labels can be rendered.
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    chart_title = "豆瓣网《" + book + "》相关产品种类分布情况"
    plt.title(chart_title)
    palette = ('lightgreen', 'gold', 'lightskyblue', 'lightcoral')
    plt.pie(
        num,
        explode=explode,
        labels=name,
        colors=palette,
        autopct='%1.1f%%',
        shadow=True,
        startangle=50,
    )
    plt.axis('equal')
    plt.show()
# Analyse the category distribution of each novel's products and draw the
# matching pie charts; the category is the first column (content[0]).
num1, num2, num3, num4 = (
    ana_type(title, table[0])
    for title, table in (
        ("红楼梦", content1),
        ("西游记", content2),
        ("水浒传", content3),
        ("三国演义", content4),
    )
)
出版时间折线图
# Analyse the publication years that were scraped.
def ana_year(book, Year):
    """Bucket the publication years into five periods and plot them.

    The buckets are: before 1979, 1979-1989, 1990-1999, 2000-2009, and
    2010 or later.  A year of 0 is the placeholder ``parse_html`` stores
    when no year could be parsed; such entries are left out so they do not
    inflate the "before 1979" bucket.

    Args:
        book: Book title, passed on to ``drawyear`` for the chart title.
        Year: Iterable of years (ints or numeric strings).
    """
    lower_bounds = (1979, 1990, 2000, 2010)
    num = [0, 0, 0, 0, 0]
    for data in Year:
        year = int(data)
        if year == 0:
            continue  # unknown year, not a real publication date
        # The bucket index equals the number of boundaries already reached.
        bucket = sum(year >= bound for bound in lower_bounds)
        num[bucket] += 1
    print(num)
    drawyear(book, num)
#定义函数,绘制出版年份柱状图
import matplotlib.pyplot as plt
from pylab import mpl#字体
#解决中文显示问题
from matplotlib import mlab
from matplotlib import rcParams
def drawyear(book, count):
    """Show a line chart of products per publication period for *book*.

    Args:
        book: Book title used in the chart title.
        count: Five values, one per period label on the x axis.

    Returns:
        ``count`` unchanged.
    """
    # SimHei is selected so the Chinese labels can be rendered.
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    chart_title = "《" + book + "》相关产品出版年份统计"
    plt.title(chart_title)
    periods = ['1979年以前', '1979-1990', '1990-2000', '2000-2010', '2010年之后']
    plt.xlabel("时间")
    plt.ylabel("新出版产品数量")
    plt.plot(periods, count)
    plt.show()
    return count
# Analyse the publication year, i.e. the third column (content[2]).
for _title, _table in (
    ("红楼梦", content1),
    ("西游记", content2),
    ("水浒传", content3),
    ("三国演义", content4),
):
    ana_year(_title, _table[2])
评价柱状图
# Analyse the number of ratings and the scores of the four novels.
def ana_mark(people, mark):
    """Average the rating counts and scores per novel and chart them.

    Args:
        people: Four sequences of rating counts (ints or numeric strings),
            one per novel.
        mark: Four sequences of scores (floats or numeric strings), one per
            novel.

    An empty sequence yields an average of 0 rather than a division error.
    """
    # NOTE(review): entries with a value of 0 still count towards the
    # divisor, as before -- confirm that unrated items should lower the mean.
    sum_ = [0, 0, 0, 0]
    mark_ = [0, 0, 0, 0]
    for i in range(4):
        counts = [int(value) for value in people[i]]
        scores = [float(value) for value in mark[i]]
        if counts:
            sum_[i] = sum(counts) / len(counts)
        if scores:
            mark_[i] = sum(scores) / len(scores)
    print(sum_)
    drawmark(mark_)
    draw_mark(sum_, mark_)
#定义函数,画四大名著得分比较的柱状图
def autolabel(rects):
    """Write each bar's height as a text label just above the bar.

    Args:
        rects: Iterable of bar objects exposing ``get_height`` and ``get_x``.
    """
    for bar in rects:
        value = bar.get_height()
        label_x = bar.get_x() + 0.14
        label_y = 1.03 * value  # slightly above the top of the bar
        plt.text(label_x, label_y, '%s' % value)
def drawmark(mark):
    """Show a bar chart comparing the average score of the four novels.

    Args:
        mark: List of four average scores; the values are rounded to two
            decimals in place before plotting.
    """
    # SimHei is selected so the Chinese labels can be rendered.
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    plt.title("四大名著相关出版物平均得分统计")
    plt.xlabel("书名")
    plt.ylabel("得分")
    positions = (0, 1, 2, 3)
    plt.xticks(positions, ('红楼梦', '西游记', '水浒传', '三国演义'))
    for idx in positions:
        mark[idx] = round(mark[idx], 2)  # keep two decimals
    print(mark)
    bars = plt.bar(x=positions, height=mark, width=0.35, align="center", color='orange')
    autolabel(bars)  # print the value above every bar
    plt.show()
#定义函数,绘制并列柱状图,分析评价人数和得分
import matplotlib.pyplot as plt
import numpy as np
def autolabel2(rects2):
    """Label each bar with its height divided by 5000, printing the height.

    Args:
        rects2: Iterable of bar objects exposing ``get_height`` and ``get_x``.
    """
    for bar in rects2:
        height = round(bar.get_height(), 2)
        print(height)
        label_x = bar.get_x() + 0.05
        label_y = 1.03 * height  # slightly above the top of the bar
        plt.text(label_x, label_y, '%s' % round(height / 5000, 2))
def draw_mark(sum_, mark_):
    """Show average rating counts and average scores as side-by-side bars.

    The scores are multiplied by 5000 so they are visible next to the
    rating counts; ``autolabel2`` divides by the same factor when labelling.

    Args:
        sum_: Average number of ratings per novel (four values).
        mark_: Average score per novel (four values).  The list is only
            read; the scaled copy used for plotting is local.
    """
    name_list = ['红楼梦', '西游记', '水浒传', '三国演义']
    score_scale = 5000
    # Round to two decimals, then scale -- on a copy, not on the caller's list.
    scaled_scores = [score_scale * round(score, 2) for score in mark_]

    total_width, n = 0.8, 2
    width = total_width / n
    left_positions = list(range(len(sum_)))
    # The second series sits one bar-width to the right of the first.
    right_positions = [pos + width for pos in left_positions]

    plt.bar(left_positions, sum_, width=width, label='平均评论人数', fc='y')
    rect2 = plt.bar(
        right_positions,
        scaled_scores,
        width=width,
        label='平均得分',
        tick_label=name_list,
        fc='r',
    )
    plt.legend()
    plt.title("豆瓣网四大名著相关评论数和得分情况")
    plt.xlabel("书籍")
    plt.ylabel("数量")
    autolabel2(rect2)
    plt.show()
# Analyse the average score and the number of ratings of the four novels:
# column 3 holds the rating counts, column 4 the scores.
people = [table[3] for table in (content1, content2, content3, content4)]
mark = [table[4] for table in (content1, content2, content3, content4)]
ana_mark(people, mark)
2.
def ana_typemark(book, type_, num, mark_):
    """Compute the average score per category and chart it.

    Args:
        book: Book title, passed on to ``drawtype`` for the chart title.
        type_: Category of every product.
        num: Number of products per category, in order of first appearance
            of each category in ``type_`` (what ``ana_type`` returns).
        mark_: Score of every product, parallel to ``type_``.
    """
    # Sum the scores per category.  Every record is included, starting with
    # the first one: the header row is already dropped when the CSV is read,
    # and skipping a record would both lose its score and could change the
    # category order that ``num`` is aligned with.
    type_dict = {}
    for kind, score in zip(type_, mark_):
        type_dict[kind] = type_dict.get(kind, 0.0) + float(score)

    typename = list(type_dict.keys())
    average = [total / count for total, count in zip(type_dict.values(), num)]
    explode = (0,) * len(typename)
    drawtype(book, typename, average, explode)
# Bar chart comparing the scores of the different categories of one novel.
def drawtype(book, typename, average, explode):
    """Show a bar chart of the average score per category for *book*.

    Args:
        book: Book title used in the chart title.
        typename: Category names, used as x tick labels.
        average: Average score per category; rounded to two decimals in
            place before plotting.
        explode: Accepted for call compatibility; not used here.
    """
    # SimHei is selected so the Chinese labels can be rendered.
    mpl.rcParams['font.sans-serif'] = ['SimHei']
    chart_title = "《" + book + "》不同类型出版物平均得分统计"
    plt.title(chart_title)
    plt.xlabel("类型")
    plt.ylabel("平均得分")
    positions = tuple(range(len(typename)))
    labels = tuple(typename)
    plt.xticks(positions, labels)  # both arguments are passed as tuples
    for idx in positions:
        average[idx] = round(average[idx], 2)  # keep two decimals
    print(average)
    bars = plt.bar(x=positions, height=average, width=0.35, align="center", color='lightblue')
    autolabel(bars)  # print the value above every bar
    plt.show()
# Chart the average score per category for each of the four novels.
for _title, _table, _counts in (
    ("红楼梦", content1, num1),
    ("西游记", content2, num2),
    ("水浒传", content3, num3),
    ("三国演义", content4, num4),
):
    ana_typemark(_title, _table[0], _counts, _table[4])
名著高频词
#分别统计四本书中出现次数最多的二十个词组
import os
def maxword(book, path):
    """Print the twenty most frequent two-character sequences in a text.

    A pair of adjacent characters on the same line is counted only when
    neither character is punctuation or one of the excluded filler
    characters.  Ties keep the order in which the pairs first appear.

    Args:
        book: Label printed in the heading line.
        path: Path of the text file, read as GB18030.
    """
    # Punctuation and filler characters that must not be part of a pair.
    exclude_str = ",。!?、()【】<>《》=:+-*—“”‘’… ;\n了你我她他的\u3000不一来人这儿是下此着个子下上曰之有"
    excluded = frozenset(exclude_str)

    count = {}
    # The with-block closes the file even if decoding fails part-way.
    with open(path, encoding='gb18030') as f:
        for line in f:
            # Walk over every pair of neighbouring characters in the line.
            for first, second in zip(line, line[1:]):
                if first not in excluded and second not in excluded:
                    pair = first + second
                    count[pair] = count.get(pair, 0) + 1

    # Sort by frequency, highest first (x[1] is the count, x[0] the pair).
    lstWords = sorted(count.items(), key=lambda x: x[1], reverse=True)
    print(book + "中出现频率最高的二十个词组为:")
    print('字符\t字频')
    for e in lstWords[:20]:
        print('%s\t%d' % e)
# Report the most frequent character pairs for each of the four novels.
for _label, _text_path in (
    ("《红楼梦》", r"C:\Users\18172\Jupyter\红楼梦.txt"),
    ("《西游记》", r"C:\Users\18172\Jupyter\西游记.txt"),
    ("《水浒传》", r"C:\Users\18172\Jupyter\水浒传.txt"),
    ("《三国演义》", r"C:\Users\18172\Jupyter\三国演义.txt"),
):
    maxword(_label, _text_path)
感悟
我最开始学习 Python 就是从 Python 3 开始的,做爬虫用的也是 Python 3。首先,必须掌握 Python 3 的语法;不过 Python 3 并不难,语法也非常简洁。但是写着写着发现有个毛病:比如一个形参,由于类型不确定,IDE 无法像 Java 那样自动补全,如果忘了某个函数,就必须去查看代码或手册……不过这是动态语言都有的问题,好的 IDE 会帮你记录,所以使用好的 IDE 很有必要,哈哈。
然后是学习 Python 的各种库。为了打好基础,从基础的库开始学习会比较好,比如先学 urllib,再学 requests,这类库是用来发请求的。返回的数据又有各种各样的类型,有 HTML、JS、JSON、字符串等,每种格式都需要合适的解析器,当然,用正则表达式也都能处理……这里,解析 HTML 会用到 XPath、BeautifulSoup、pyquery 等库;JS 代码需要 JS 引擎来运行和分析,例如 Node.js、V8;JSON 的解析要用到 json 库;字符串就看具体情况了。