import feedparser
import re
import requests
from bs4 import BeautifulSoup
import jieba
def gettitle(url):
    """Fetch *url* over HTTP and return the text of its <title> tag.

    Returns '' when the request does not return HTTP 200 or the page has
    no <title>. The title is also printed as a progress indicator.
    """
    # Send the HTTP request; the timeout keeps the crawl from hanging
    # forever on an unresponsive host (the original had none).
    response = requests.get(url, timeout=10)
    # Make sure the page request succeeded
    if response.status_code == 200:
        # Parse the page content with BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')
        # soup.title is None for pages without a <title>; the original
        # crashed with AttributeError in that case.
        title = soup.title.text if soup.title is not None else ''
        print(title)
    else:
        print('Failed to retrieve the webpage')
        title = ''
    return title
# Returns title and dictionary of word counts for an RSS feed
# Returns title and dictionary of word counts for a feed URL
def getwordcounts(url):
    """Fetch the page title for *url*, segment it with jieba, and return
    ``(title, counts)`` where *counts* maps each kept token to its
    frequency and *title* is the first token that was counted."""
    text = gettitle(url)  # feedparser.parse(url)
    counts = {}
    title = ""
    # Segment the fetched title text into tokens
    for token in jieba.cut(text):
        # Skip punctuation / empty tokens
        if not removesymbols(token):
            continue
        counts[token] = counts.get(token, 0) + 1
        # The very first distinct token doubles as the feed's label
        if len(counts) == 1:
            title = token
    return title, counts
#去除标点符号
def removesymbols(word):
word=word.strip()
if (word != "(") & (word != ")") & (word != "-") & (word != ",") & (word != "!") & (word != ".")& (word != "、")& (word != ""):
return True
return False
def getwords(html):
    """Strip HTML tags from *html* and return its lowercase alphabetic words."""
    # Remove all the HTML tags
    txt = re.sub(r'<[^>]+>', '', html)
    # Split on runs of non-alphabetic characters. The original pattern
    # [^A-Z^a-z] accidentally put a literal '^' inside the negated class,
    # so '^' was treated as a word character ("foo^bar" stayed one word).
    words = re.split(r'[^A-Za-z]+', txt)
    # Convert to lowercase and drop the empty strings split() produces
    # at the text boundaries.
    return [word.lower() for word in words if word != '']
# --- Build the blog/word-count matrix ------------------------------------
apcount = {}      # word -> number of feeds in which it appears
wordcounts = {}   # feed title -> {word: count}

with open('feedlist.txt', 'r') as file:
    feedlist = [line for line in file]

for feedurl in feedlist:
    try:
        title, wc = getwordcounts(feedurl)
        wordcounts[title] = wc
        for word, count in wc.items():
            apcount.setdefault(word, 0)
            # Appearance threshold — tweak if needed
            if count >= 1:
                apcount[word] += 1
    # Was a bare "except:", which also swallowed SystemExit and
    # KeyboardInterrupt; Exception keeps the best-effort behavior
    # while letting the process be interrupted.
    except Exception:
        print('Failed to parse feed %s' % feedurl)

# Keep only words that appear in some, but not all, feeds
# (the bounds can be adjusted)
wordlist = []
for w, bc in apcount.items():
    frac = float(bc) / len(feedlist)
    if 0.0 < frac < 1.0:
        wordlist.append(w)

# Emit a tab-separated matrix: one row per feed, one column per word
with open('blog.txt', 'w') as out:
    out.write('Blog')
    for word in wordlist:
        out.write('\t%s' % word)
    out.write('\n')
    for blog, wc in wordcounts.items():
        print(blog)
        out.write(blog)
        for word in wordlist:
            if word in wc:
                out.write('\t%d' % wc[word])
            else:
                out.write('\t0')
        out.write('\n')
# Python集体智慧编程之发现群组--爬取URL
# (Scraped-article residue, not code: post last updated 2024-09-26 11:55:47)