python学习之网页文章爬取与词云生成

最新推荐文章于 2024-01-17 14:58:55 发布

胶鞋老大哥

最新推荐文章于 2024-01-17 14:58:55 发布

阅读量4.9k

点赞数 8

分类专栏： Python学习文章标签： python html

本文链接：https://blog.csdn.net/m0_49986790/article/details/109295570

版权

Python学习专栏收录该内容

5 篇文章 1 订阅

订阅专栏

The second homework of the Map Visualization course

网页文章爬取与词云生成

作业思路

作业思路

主要分两部分，第一部分是网页爬取文章，第二部分是词频统计与词云生成

第一部分网页爬取文章

过程：分成三步，分别定义三个函数

1.getHtml，用于获取网页内容  
2.getContent，爬取文章标题和正文内容  
3.saveFile,将爬取的文章标题和内容保存到本地

代码:

引用的库

import requests#发起网络请求，交接收返回的服务器数据
import bs4
import os#将数据输出本地文件
from bs4 import BeautifulSoup

getHtml用于获取网页内容

# getHtml downloads a web page and returns its text.
def getHtml(url):
    """Fetch a web page and return its decoded text.

    Args:
        url: Address of the page to download.

    Returns:
        The page body as text, or the sentinel string "网页获取异常" when the
        request fails (connection error, timeout, or HTTP error status).
    """
    # Disguise the script as a browser via the User-Agent header.
    headers = {'user-agent': 'Mozilla/5.0 '}
    try:
        # The timeout keeps an unresponsive server from blocking forever.
        r = requests.get(url, headers=headers, timeout=10)
        # Turn 4xx/5xx responses into an exception.
        r.raise_for_status()
        # Use the encoding detected from the content to avoid garbled text.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request-related failures are mapped to the sentinel;
        # unrelated errors (e.g. KeyboardInterrupt) still propagate.
        return "网页获取异常"

getContent用于爬取文章标题和正文内容

# getContent parses the page and extracts the article title and body.
def getContent(html):
    """Extract the article title and body text from a page's HTML.

    The title is read from the first <h1> tag; the body is the text of
    every <p> tag inside the first <div class="content">.

    Args:
        html: HTML source of the article page.

    Returns:
        The title followed by the paragraphs, each terminated by a newline.
        A missing title or content container contributes nothing, so the
        result may be an empty string.
    """
    soup = bs4.BeautifulSoup(html, 'html.parser')

    # soup.h1 is None when the page has no <h1> tag.
    heading = soup.h1
    title = heading.text if heading is not None else ''

    # find() returns None when no <div class="content"> exists.
    container = soup.find('div', attrs={'class': 'content'})
    paragraphs = container.find_all('p') if container is not None else []

    # One paragraph per line.
    content = ''.join(p.text + '\n' for p in paragraphs)

    # Combine title and body into the full article.
    if title:
        return title + '\n' + content
    return content

saveFile保存爬取到内容

# saveFile writes the scraped article to a local file.
def saveFile(content,path,filename):
    """Write *content* to the file *filename* inside the directory *path*.

    Args:
        content: Text to write.
        path: Directory that will hold the file; created (including any
            missing parent directories) when it does not exist. An empty
            string means the current working directory.
        filename: Name of the file, resolved relative to *path*.

    The file is written as UTF-8 and overwritten if it already exists.
    """
    # An empty path means "current directory"; there is nothing to create.
    if path:
        os.makedirs(path, exist_ok=True)
    # Join the directory and file name so the file lands inside *path*.
    target = os.path.join(path, filename)
    # The with-statement closes the file; no explicit close() is needed.
    with open(target, 'w', encoding='utf-8') as f:
        f.write(content)
    print('文件保存成功')
main主函数

def main(url="http://news.china.com.cn/2019-09/23/content_75233135.shtml",
         path=".",
         filename="1.txt"):
    """Download one article and save its text to a local file.

    Args:
        url: Address of the article page to scrape.
        path: Directory passed to saveFile.
        filename: File name passed to saveFile. The default "1.txt" in the
            working directory is the file the word-frequency script opens.
    """
    html = getHtml(url)
    # getHtml returns this sentinel instead of raising when the download fails.
    if html == "网页获取异常":
        print(html)
        return
    content = getContent(html)
    saveFile(content, path, filename)


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

第二部分制作词云

1.读取文本文件
2.利用jieba分词，存入分档
3.词云生成
4.界面显示
5.存成图片

代码：

引用的库

import jieba #引入jieba库用于分词
import wordcloud #词云展示库
import matplotlib.pyplot as plt #图像展示库
import numpy as np #numpy数据处理库
from PIL import Image #图像处理库
import collections #词频统计库
import seaborn as sns

读取文本，jieba分词，统计词频

# Read the saved article into `text`. The with-statement guarantees the
# file is closed, even if reading fails.
path="1.txt"
with open(path,'r',encoding='utf-8') as f:  # 'r' opens the file read-only
    text=f.read()

# Use the SimHei (bold) font so Chinese labels render instead of appearing garbled.
plt.rcParams['font.sans-serif']='SimHei'

# Segment the text with jieba's accurate mode; lcut returns a list of tokens.
sWords=jieba.lcut(text)

# Count word frequencies, skipping stop words and whitespace-only tokens
# (such as the newlines between paragraphs), which are not words.
stopwords={'，','。','的','、','n','和','了','是','要','在','”','“','将','也'}
wordlist=[word for word in sWords if word.strip() and word not in stopwords]

word_counts=collections.Counter(wordlist)
word_counts_top10=word_counts.most_common(10)

# Draw a bar chart of the most frequent words (at most 10).
plt.xlabel('高频词语',fontproperties="SimHei",fontsize=20)
plt.ylabel('出现频率/次',fontproperties="SimHei",fontsize=20)

# Split the (word, count) pairs into bar labels and bar heights.
x=[word for word,_ in word_counts_top10]
y=[count for _,count in word_counts_top10]

# most_common(10) yields fewer than 10 entries for short texts, so the tick
# positions follow the actual number of words instead of a fixed 10.
scale_x=range(len(x))
plt.xticks(scale_x,x,rotation=30)
plt.title("top10词频统计",color='red',fontsize=20)
plt.bar(x,y,width=0.5,color='c')

# Label each bar with its count, placed slightly above the bar top.
for i,count in enumerate(y):
    plt.annotate(count,xy=(i,count),xytext=(i,count+1),color='red',ha='center',fontsize=15)
sns.despine()
plt.savefig(fname='count_bar.jpg')

词云生成

# Build the word cloud from the word frequencies, shaped by a mask image.
mask = np.array(Image.open("timg.png"))
cloud_options = dict(
    font_path="HYZYTJ.ttf",   # font file used to render the words
    width=1000,
    height=700,
    background_color="white",
    mask=mask,
    scale=2,
    max_words=100,
)
w = wordcloud.WordCloud(**cloud_options)
w.generate_from_frequencies(word_counts)
# Optional (not enabled): recolor the cloud from the mask image with
# wordcloud.ImageColorGenerator(mask) and w.recolor(color_func=...).

# One large figure with two panels side by side.
plt.figure(figsize=(20, 20))

# Left panel: the original mask image.
plt.subplot(1, 2, 1)
plt.imshow(mask)
plt.title('原图', fontproperties='SimHei', fontsize=30)
plt.axis('off')  # hide the axes

# Right panel: the generated word cloud.
plt.subplot(1, 2, 2)
plt.title('词云图', fontproperties='SimHei', fontsize=30)
plt.imshow(w)
plt.axis('off')  # hide the axes
plt.show()

# Save the word cloud as an image file.
w.to_file("1.png")