The function used earlier to scrape the Douban app can be reused to scrape news. The core of the job is extracting links with regular expressions and then fetching and parsing each extracted link in turn, repeating the scrape for the different positions on the page.
import re                              # regular expressions
import urllib.request, urllib.error    # build the request and fetch the page
from bs4 import BeautifulSoup          # HTML parsing

findlink = re.compile(r'<a href="(.*?)" target="_blank">')  # link to each article (differs per site)
findword = re.compile(r'<p>(.*?)</p>')                      # article body paragraphs

def getData(baseurl):
    links = []
    html = askURL(baseurl)             # fetch the front page
    soup = BeautifulSoup(html, "html.parser")
    try:
        # The articles sit in several different blocks on the page, so the same
        # extraction runs once for each (tag, class) pair.
        for tag, cls in (('li', 'clearfix'), ('div', 'list-focus'), ('li', 'active')):
            for item in soup.find_all(tag, class_=cls):
                link = re.findall(findlink, str(item))[0]          # URL of the article
                links.append(link)
                page = str(BeautifulSoup(askURL(link), "html.parser"))
                word = str(re.findall(findword, page))             # paragraph text
                word = str(re.findall(r'[\u4e00-\u9fa5]', word))   # keep only Chinese characters
                word = word.replace("'", '').replace(",", '').replace(" ", '')
                # Append to the text file; GBK so the word-cloud script can read it back.
                with open("新闻.txt", "a+", encoding="gbk") as f:
                    f.write(word)
    except Exception as e:
        print(e)

def askURL(url):
    # Pretend to be an ordinary browser so the site serves the normal page.
    head = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.42"
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        response = urllib.request.urlopen(request)
        html = response.read().decode("utf-8")
    except urllib.error.URLError as e:   # report errors instead of crashing
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html

def main():
    baseurl = " "    # URL of the news site's front page (fill in)
    getData(baseurl)

if __name__ == "__main__":
    main()
    print("爬取完毕")
Here I scrape news elements from several positions on the page (the 'li clearfix', 'div list-focus' and 'li active' blocks), calling the parsing routine once per block; everything else follows the usual pattern.
The User-Agent entry in head disguises the request as an ordinary browser; urllib then fetches the page, and the error code or reason is printed when something goes wrong. The regular expressions were worked out by inspecting the page with the browser's F12 developer tools and trimming away the unnecessary parts.
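As a quick illustration of how such a pattern is derived, here is a minimal sketch (the HTML fragment is made up for demonstration, modelled on what the F12 inspector shows) of findlink pulling the href out of that markup shape:

import re

# Hypothetical fragment of the markup seen in the F12 element inspector.
snippet = '<li class="clearfix"><a href="https://example.com/news/1.html" target="_blank">标题</a></li>'

findlink = re.compile(r'<a href="(.*?)" target="_blank">')
print(re.findall(findlink, snippet))   # ['https://example.com/news/1.html']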
Next comes the word-cloud generation.
import re
import jieba                       # Chinese word segmentation
from wordcloud import WordCloud
import cv2 as cv                   # used only to read the mask image

def analysis(savepath):
    with open(savepath, 'r', encoding='gbk') as f:
        result = f.read()
    # Strip Latin letters, digits, punctuation and whitespace.
    result = re.sub('[a-zA-Z0-9"#$%&\'()*+,-./::;""()<=>?@,。?、…【】《》?![\\]^_`{|}~\s]+', '', result)
    words = jieba.lcut(result)                           # segment into words
    string = [word for word in words if len(word) > 1]   # drop single-character words
    strings = ' '.join(string)
    mk = cv.imread(' ')            # mask image that gives the cloud its shape (fill in the path)
    # font_path: the font to use (any font file under C:\Windows\Fonts works);
    # background_color / width / height: canvas settings; mask: the shape image;
    # max_words: upper limit on words shown; colormap: word colours;
    # stopwords: words that must not appear in the cloud.
    w = WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf", background_color="Red",
                  width=1000, height=600, mask=mk, max_words=500, colormap="autumn",
                  stopwords={' '})
    w.generate(strings)
    w.to_file('wordcloud.jpg')     # save the word-cloud image
    image = w.to_image()
    image.show()

def main():
    savepath = "新闻2.txt"
    analysis(savepath)
    print("finish")

if __name__ == "__main__":
    main()
jieba segments the text into words, a loop over the words filters out the single-character ones and joins the rest back into one string, and WordCloud (its parameters are commented in the code) renders the word-cloud image. That is the whole pipeline. The scraping stage could be simplified, though: instead of handling each page block separately, scrape from one common hook, since all the news items share a feature that can be matched, as sketched below.
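For example, if every article link on the page opens in a new tab (an assumption; the real common feature depends on the site), a single regex pass over the whole page can replace the three per-block loops. A minimal sketch:

import re

def collect_links(html):
    # Hypothetical common hook: every article link carries target="_blank",
    # so one pattern over the whole page finds them all, whatever block they sit in.
    findlink = re.compile(r'<a href="(.*?)" target="_blank">')
    return list(dict.fromkeys(re.findall(findlink, html)))  # deduplicate, keep order

collect_links(askURL(baseurl)) would then feed the same per-article extraction as before.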
stopwords={' '} selects the words that should not appear in the word cloud.
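For instance, replacing the placeholder blank with real words keeps them out of the picture (the two stopwords here are just examples, and font_path assumes a standard Windows font):

from wordcloud import WordCloud

# Example only: suppress the filler words '记者' and '报道'.
w = WordCloud(font_path=r"C:\Windows\Fonts\simhei.ttf",
              stopwords={'记者', '报道'})
w.generate("记者 报道 科技 经济 文化 体育")
w.to_file('stopword_demo.jpg')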