# Beautiful Soup
## Understanding BS4
- Beautiful Soup is a Python library for extracting data from HTML or XML files. It provides simple, Pythonic functions for navigating, searching, and modifying the parse tree.
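A minimal quickstart sketch of those three operations (the inline HTML string here is made up for illustration): parse a document, navigate to a tag, search for one, and modify the tree.
from bs4 import BeautifulSoup
html = "<html><head><title>demo</title></head><body><p id='p1'>hello</p></body></html>"
soup = BeautifulSoup(html, 'html.parser')
print(soup.title.string) # navigate: demo
print(soup.find('p')['id']) # search: p1
soup.p.string = 'world' # modify the parse tree
print(soup.p) # <p id="p1">world</p>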
## Common BS4 operations
from bs4 import BeautifulSoup
# Get tag content
# Build the soup object
soup = BeautifulSoup(open('hello.html'), 'html.parser')
print(soup.title)
print(type(soup.title))
print(soup.p)
# Get a tag's attributes
print(soup.p.attrs) # returns a dict
print(soup.p['id'])
print(soup.p['class'])
print(soup.p['style']) # id/class/style lookups all use the first p tag found
# Modify an attribute
soup.p['id']='modifyid'
print(soup.p['id'])
soup = BeautifulSoup(open('hello.html'), 'html.parser')
print(soup.head.contents) # the children of the head tag, as a list
print(soup.head.children) # returns an iterator over the same children
for i in soup.head.children:
    print('>', i)
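To clarify the traversal API: .children yields only direct children, while .descendants walks the whole subtree. A small self-contained sketch (the inline snippet is made up for illustration):
demo = BeautifulSoup('<div><p><b>x</b></p></div>', 'html.parser')
print([t.name for t in demo.div.children]) # ['p'], direct children only
print([t.name for t in demo.div.descendants if getattr(t, 'name', None)]) # ['p', 'b'], the whole subtree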
soup = BeautifulSoup(open('hello.html'), 'html.parser')
# Find specific tags (all p tags)
res = soup.find_all('p')
print(res)
# find_all also accepts a regular expression for the tag name
import re
res1 = soup.find_all(re.compile(r'd+')) # every tag whose name contains 'd', e.g. div, head, body
print(res1)
- Fine-grained searches
soup = BeautifulSoup(open('hello.html'), 'html.parser')
print(soup.find_all('p',id=re.compile(r'test\d{1}')))
print(soup.find_all('p',class_=re.compile(r'class\d{1}')))
soup = BeautifulSoup(open('hello.html'), 'html.parser')
# Search for several kinds of tags at once
print(soup.find_all(['p', 'div']))
print(soup.find_all([re.compile('^d'), re.compile('p')]))
# Match by text content
print(soup.find_all(text='文章标题'))
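Note that recent Beautiful Soup releases prefer the string= keyword for this; text= still works as an alias:
print(soup.find_all(string='文章标题')) # same result with the newer keyword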
- CSS selectors
soup = BeautifulSoup(open('hello.html'), 'html.parser')
# Common CSS selectors: tag selector (div), class selector (.class1), id selector (#idname), attribute selector (p[type="text"])
# Tag selector (div)
res1 = soup.select("p")
print(res1)
# Class selector (.class1)
res2 = soup.select(".class2")
print(res2)
# Id selector (#idname)
res3 = soup.select("#test1")
print(res3)
# Attribute selector (p[type="text"])
print(soup.select("p[id='test1']"))
print(soup.select("p['class']"))
## Fetch a blog post and save it as a PDF
from bs4 import BeautifulSoup
import requests
import pdfkit
def get_blog_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    # Keep the head so the saved page retains its styles
    head = soup.head
    # Grab the post title
    title = soup.find_all(class_='article-title-box')[0].get_text()
    # Grab the article body
    content = soup.find_all(class_="article_content")[0]
    # encoding='utf-8' so the Chinese text survives on any platform
    with open('blog.html', 'w', encoding='utf-8') as f:
        f.write(str(head))
        f.write('<h1>%s</h1>\n\n' % (title))
        f.write(str(content))
url = 'https://blog.csdn.net/zcx1203/article/details/83030349'
get_blog_content(url)
pdfkit.from_file('blog.html', 'blog.pdf')
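Note that pdfkit is only a wrapper around the external wkhtmltopdf command-line tool, which must be installed separately. If the binary is not on PATH you can point pdfkit at it explicitly; the path below is just an example:
config = pdfkit.configuration(wkhtmltopdf='/usr/local/bin/wkhtmltopdf') # example path, adjust to your install
pdfkit.from_file('blog.html', 'blog.pdf', configuration=config)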
import requests
from bs4 import BeautifulSoup
def get_blog_content(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    head = soup.head
    # Take the first matching tag and its text, as in the CSDN example above
    title = soup.find_all(class_="artical-title")[0].get_text()
    content = soup.find_all(class_='artical-content')[0]
    with open('blog_content.html', 'w', encoding='utf-8') as f:
        f.write(str(head))
        f.write('<h1>%s</h1>\n\n' % title)
        f.write(str(content))
url = 'http://blog.51cto.com/13885935/2296519'
get_blog_content(url=url)
## Case study: the douban movie site
import requests
from bs4 import BeautifulSoup
url = "https://movie.douban.com/cinema/nowplaying/xian/"
# def get_webInfo(url):
# Fetch the page
response = requests.get(url=url)
content = response.text
# Parse the page to extract each movie's id and title
soup = BeautifulSoup(content, 'html.parser')
# Every movie entry sits in an li tag with class list-item
movie_list = soup.find_all('li', class_='list-item')
# movie_list is an iterable of Tag objects
# print(type(movie_list))
# print(movie_list[0])
# Store all movie info as [{'title': '...', 'id': '...'}]
movies_info = []
# Walk each li tag and pull out the fields we need
for item in movie_list:
    now_movies_dict = {}
    # Subscripting a Tag reads an attribute: item['data-title'] is the
    # value of the data-title attribute on the li tag
    now_movies_dict['title'] = item['data-title']
    now_movies_dict['id'] = item['id']
    now_movies_dict['actors'] = item['data-actors']
    now_movies_dict['director'] = item['data-director']
    # Append the assembled dict to the list
    movies_info.append(now_movies_dict)
with open('movies.txt', 'w') as f:
    for item in movies_info:
        f.write(str(item) + '\n')
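One caveat: douban may reject requests that carry the default requests User-Agent. If response.text comes back empty, sending a browser-like header usually helps (the UA string below is just an example):
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'} # example UA string
response = requests.get(url=url, headers=headers)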
## Fetching the reviews for a given movie
# Goals:
# 1) scrape one page of comments;
# 2) scrape the first 10 pages of comments for a given movie;
# 3) scrape the comments for every movie.
import threading
import requests
from bs4 import BeautifulSoup
## First, scrape the comments on a single page
def getOneComment(id, pageNum):
    # The start query parameter is derived from the page number:
    # page 1: https://movie.douban.com/subject/26425063/comments?start=0&limit=20&sort=new_score&status=P
    # page 2: https://movie.douban.com/subject/26425063/comments?start=20&limit=20&sort=new_score&status=P
    # page 3: https://movie.douban.com/subject/26425063/comments?start=40&limit=20&sort=new_score&status=P
    start = (pageNum - 1) * 20
    url = 'https://movie.douban.com/subject/%s/comments?start=%s&limit=20&sort=new_score&status=P' % (id, start)
    # Fetch the page that holds the comments
    content = requests.get(url).text
    # Parse it with bs4
    soup = BeautifulSoup(content, 'html.parser')
    # Inspecting the page shows each comment lives in a span tag with class short
    commentslist = soup.find_all('span', class_='short')
    # Collect every comment on this page into a single string
    pageComments = ""
    for commentTag in commentslist:
        pageComments += commentTag.text
    print("%s page" % pageNum)
    # Append to the global string; the lock keeps the read-modify-write
    # safe when several threads finish at the same time
    global comments
    with lock:
        comments += pageComments
## Then scrape the first 10 pages of comments
id = '26425063'
comments = ""
lock = threading.Lock() # guards the shared comments string across threads
threads = []
# Scrape the first 10 pages of comments
for pageNum in range(10):
    pageNum = pageNum + 1
    # getOneComment(id, pageNum)
    # One thread per page
    t = threading.Thread(target=getOneComment, args=(id, pageNum))
    threads.append(t)
    t.start()
_ = [thread.join() for thread in threads]
print("执行结束")
with open("%s.txt" %id, 'w') as f:
f.write(comments)
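The same fan-out can be written with concurrent.futures, which joins the workers automatically when the with-block exits; a sketch under the same assumptions:
from concurrent.futures import ThreadPoolExecutor
with ThreadPoolExecutor(max_workers=10) as executor:
    for pageNum in range(1, 11):
        executor.submit(getOneComment, id, pageNum)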
## To sum up
The complete analysis pipeline:
- Data acquisition: scraping (urllib or requests to fetch pages, re or bs4 to parse them)
- Data cleaning: process the raw text into a consistent format
## Data cleaning
import re
# 1. Clean the scraped comments: strip needless commas, periods, and emoji, keeping only Chinese or English text
with open("./26425063.txt") as f:
comments = f.read()
pattern = re.compile(r'([\u4e00-\u9fa5]+|[a-zA-Z]+)')
deal_comments = re.findall(pattern, comments)
# print(type(deal_comments)) # 返回一个列表
newComments = ""
for item in deal_comments:
newComments += item
print(newComments)
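The same cleaning can be done in one step with re.sub, dropping every character that is neither a Chinese character nor an ASCII letter:
newComments = re.sub(r'[^\u4e00-\u9fa5a-zA-Z]', '', comments) # equivalent one-liner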
## Word cloud analysis
Note on modules: matplotlib is sometimes needed as well. Also, for image handling Python 2 used the Image package, but in Python 3 do not install Image; install pillow, which provides PIL.
import jieba
import wordcloud
import numpy as np
from PIL import Image
import matplotlib
# Segment the Chinese text: lcut returns a list, cut returns a generator
result = jieba.lcut(open('./26425063.txt').read())
# Open the mask image
imageObj = Image.open('./mao.jpg')
cloud_mask = np.array(imageObj)
# Draw the word cloud
wc = wordcloud.WordCloud(
mask=cloud_mask,
background_color='black',
    font_path='./msyh.ttf', # a font with Chinese glyphs is required for Chinese text
min_font_size=50,
max_font_size=200,
    width=500, # width of the generated image
)
wc.generate(",".join(result))
wc.to_file('./douban.png')
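Since matplotlib is already imported, the word cloud can also be shown on screen instead of only written to a file:
import matplotlib.pyplot as plt
plt.imshow(wc, interpolation='bilinear') # WordCloud objects render directly
plt.axis('off')
plt.show()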
The result is pretty ugly, but it achieves the intended effect.
# Worth noting:
- If lcut splits a phrase incorrectly, you can correct it as follows.
For example, with text = "马云曾公开表态称对钱没兴趣称其从来没碰过钱上了微博热搜", the tokens '微博热' and '搜' are split incorrectly:
jieba.suggest_freq('微博', True)
jieba.suggest_freq('热搜', True)
To boost every word listed in a file, load it as a user dictionary:
jieba.load_userdict('./doc/newWord')
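The user dictionary is a plain text file with one entry per line; a frequency and part-of-speech tag may optionally follow each word (this is jieba's documented format; the entries below are examples):
# Example contents of ./doc/newWord:
# 微博 3 n
# 热搜 3 n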