Original article: https://blog.csdn.net/qq_44671752/article/details/104854908
from html.parser import HTMLParser
from re import sub
from sys import stderr
from traceback import print_exc
from urllib.request import urlopen
from urllib.error import URLError
from bs4 import BeautifulSoup
import time
import datetime
from apscheduler.schedulers.background import BackgroundScheduler
class _DeHTMLParser(HTMLParser):
    """Strips tags from an HTML document, keeping paragraph and line breaks."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.__text = []

    def handle_data(self, data):
        # Collapse runs of whitespace inside text nodes into single spaces.
        text = data.strip()
        if len(text) > 0:
            text = sub('[ \t\r\n]+', ' ', text)
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        if tag == 'p':
            self.__text.append('\n\n')
        elif tag == 'br':
            self.__text.append('\n')

    def handle_startendtag(self, tag, attrs):
        if tag == 'br':
            self.__text.append('\n\n')

    def text(self):
        return ''.join(self.__text).strip()
def dehtml(text):
    """Convert an HTML string to plain text; fall back to the raw input on error."""
    try:
        parser = _DeHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    except Exception:
        print_exc(file=stderr)
        return text
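# Quick sanity check for dehtml (illustrative input, not from the crawled pages):
#   dehtml('<p>hello<br>world</p>')  ->  'hello \nworld'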
def getTitle(url):
    # Despite the name, this returns the raw HTTP response for the URL,
    # not a page title; main() feeds it straight into BeautifulSoup.
    try:
        return urlopen(url)
    except URLError:  # HTTPError is a subclass of URLError, so both are caught
        print("Connection failed")
        return None
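# Example usage (hypothetical URL):
#   response = getTitle('https://s.weibo.com/top/summary')
#   if response is not None:
#       soup = BeautifulSoup(response, features="lxml")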
def cleandata(path, clean_path):
    """Filter Weibo page boilerplate out of the raw crawl file."""
    with open(path, 'r') as f:
        with open(clean_path, 'w') as clean:
            text = f.readlines()
            i = 0
            while i < len(text):
                line = text[i]
                i = i + 1
                try:
                    # Positional heuristics for boilerplate; a line shorter than
                    # five characters raises IndexError below and is dropped.
                    if line[0] + line[1] in ['微博', '北京', '今天', '简介', '关注']:
                        continue
                    if line[3] == "万":   # view counts such as "123万"
                        continue
                    if line[2] == "月":   # dates such as "03月14日"
                        continue
                    if line[0] == "C":
                        continue
                    if line[4] == "年":   # dates such as "2020年..."
                        continue
                    if line[-3] == "c":
                        continue
                    if line[-5] == "热":
                        continue
                    if line[0] + line[1] == '更多':
                        # "更多" (more) marks a footer block; skip the 30 lines after it.
                        i = i + 30
                        continue
                    clean.write(line)
                    clean.write("\n----------------------------------------------------------------------------------\n")
                except IndexError:
                    pass
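# Illustration of the heuristics above (sample lines are hypothetical):
#   "微博热搜 ..."  -> skipped (starts with 微博)
#   "123万"         -> skipped (fourth character is 万, a view count)
#   "03月14日 ..."  -> skipped (third character is 月, a date)
#   "更多 ..."      -> skipped along with the 30 footer lines that follow it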
def main():
    # The three paths below need to be adjusted for your machine:
    #   path       - where the raw (uncleaned) text is stored
    #   url_path   - where the list of page URLs to crawl is stored
    #   clean_path - where the cleaned data is written
    path = "/Users/xiangxiangyongan/MyFile/txt/weibo.txt"
    clean_path = "/Users/xiangxiangyongan/MyFile/txt/weibo_clean.txt"
    url_path = "/Users/xiangxiangyongan/MyFile/txt/url.txt"
    with open(path, 'w') as f:
        with open(url_path, 'r') as url_read:
            url = url_read.readlines()
            for i in range(0, len(url)):
                urlTest = url[i]
                text = getTitle(urlTest)
                if text is None:  # skip URLs that failed to open
                    continue
                text = BeautifulSoup(text, features="lxml")
                text = str(text)
                f.write(dehtml(text))
                time.sleep(2)  # be polite: pause between requests
                print("Round " + str(i) + " done")
    cleandata(path, clean_path)
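# url.txt is expected to hold one page URL per line, e.g. (hypothetical):
#   https://s.weibo.com/top/summary
#   https://weibo.com/p/...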
scheduler = BackgroundScheduler()
scheduler.add_job(main, 'interval', minutes=2, start_date='2020-03-14 8:00:01', end_date='2020-03-14 8:02:10')
scheduler.start()
# BackgroundScheduler runs in a daemon thread; block until the window has passed.
while datetime.datetime.now() < datetime.datetime(2020, 3, 14, 8, 2, 20):
    time.sleep(5)
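If the script has nothing to do besides run the crawl on a schedule, APScheduler's BlockingScheduler is a simpler fit: its start() call blocks the main thread, so no keep-alive loop is needed. A minimal sketch reusing the same job and time window:

from apscheduler.schedulers.blocking import BlockingScheduler

blocking = BlockingScheduler()
blocking.add_job(main, 'interval', minutes=2, start_date='2020-03-14 8:00:01', end_date='2020-03-14 8:02:10')
blocking.start()  # blocks here; press Ctrl+C to stop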