1. First, use Python to walk the "html" folder in the program directory, recursing into every subdirectory, and collect all of the .html files it contains into a list.
2. Hand that list to the html_to_txt function, which loops over the files and extracts the text of the specified tags in each one: the <h1> heading and the <p> paragraphs inside the target <div> container.
3. Write the extracted text out to a .txt file. Before writing you can add replace() calls to strip content you don't want (chain as many replacements as you need, insert line breaks, and so on), then output the result; adapt this to your own needs, and feel free to ask if anything is unclear. A sketch of such a cleanup pass follows this list.
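The replace pass described in step 3 is not in the code below; a minimal sketch of what it could look like (the strings being stripped are placeholders, swap in whatever your pages actually contain):

```python
def clean_text(text):
    # drop fragments we don't want in the output; these terms are placeholders
    for junk in ('\u3000', '\xa0', '广告'):
        text = text.replace(junk, '')
    # add a line break after each full sentence before writing out
    return text.replace('。', '。\n')
```

You could call this on each entry in printID just before it writes.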
main.py
```python
import os
from lxml import etree
from Html_To_txt import html_to_txt
# Parse a local html file with lxml XPath and write the extracted text to txt
# (an alternative to the parser in Html_To_txt.py; not called from __main__)
def parse_html(file_paths):
    for ff in file_paths:
        val = []
        with open(ff, 'r', encoding='gbk') as f:
            html = etree.HTML(f.read())
        title = html.xpath("//*[@id='left']/div/div[2]/h1/text()")
        val.extend(title)
        contents = html.xpath("//td[@class='info_content']/*")
        for td in contents:
            if td.text:
                val.append(td.text)
            # collect the text (and trailing tail text) of any <strong><a> children
            val.extend(html_for(td.xpath("./strong/a")))
        out_path = os.path.join(os.getcwd(), 'txt', os.path.basename(ff))
        with open(out_path, 'w', encoding='utf-8') as txt:
            txt.write('\n'.join(v for v in val if v))
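# hypothetical usage of the lxml alternative above, reusing search_dir below:
#   parse_html(search_dir(os.path.join(os.getcwd(), 'html')))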
def html_for(html_obj):
    # Recursively flatten the text of a list of lxml elements,
    # keeping each element's trailing tail text as well
    aaa_ls = []
    for bbb in html_obj:
        if len(bbb):  # the element has children: recurse into them
            aaa_ls.extend(html_for(list(bbb)))
        else:
            aaa_ls.append((bbb.text or '') + (bbb.tail or ''))
    return aaa_ls
def search_dir(pathstr, file_all=None):
    if file_all is None:  # avoid a shared mutable default argument
        file_all = []
    files = os.listdir(pathstr)  # every entry in the folder
    for file_str in files:  # walk the folder
        full = os.path.join(pathstr, file_str)
        if os.path.isdir(full):  # a subfolder: recurse into it
            search_dir(full, file_all)
        elif os.path.splitext(file_str)[1] == '.html':  # an .html file
            print(full)
            file_all.append(full)
    return file_all
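# The same traversal with os.walk, which handles the recursion itself;
# a sketch equivalent to search_dir above:
def search_dir_walk(pathstr):
    file_all = []
    for root, dirs, files in os.walk(pathstr):
        for name in files:
            if os.path.splitext(name)[1] == '.html':
                file_all.append(os.path.join(root, name))
    return file_all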
if __name__ == '__main__':
    path = os.path.join(os.getcwd(), 'html')
    os.makedirs(os.path.join(os.getcwd(), 'txt'), exist_ok=True)  # output folder for the txt files
    aaa = search_dir(path)
    html_to_txt(aaa)
```
Html_To_txt.py
```python
import os
from html.parser import HTMLParser  # sgmllib was removed in Python 3
class GetIdList(HTMLParser):
    def reset(self):
        self.IDlist = []
        self.flag = False
        self.getdata = False
        self.verbatim = 0
        HTMLParser.reset(self)

    # html.parser does not dispatch per tag the way sgmllib did,
    # so route the tags we care about to the handlers below
    def handle_starttag(self, tag, attrs):
        if tag == 'div':
            self.start_div(attrs)
        elif tag == 'h1':
            self.start_h1(attrs)
        elif tag == 'p':
            self.start_p(attrs)

    def handle_endtag(self, tag):
        if tag == 'div':
            self.end_div()
        elif tag == 'h1':
            self.end_h1()
        elif tag == 'p':
            self.end_p()
    def start_div(self, attrs):
        if self.flag:
            self.verbatim += 1  # entered a nested div, so depth + 1
            return
        for k, v in attrs:  # walk all of the div's attributes and values
            if k == 'class' and v == 'article-content':  # we are inside <div class='article-content'>
                self.flag = True
                return

    def end_div(self):  # on </div>
        if self.verbatim == 0:
            self.flag = False
        if self.flag:  # leaving a nested div, so depth - 1
            self.verbatim -= 1
    def start_h1(self, attrs):
        if not self.flag:
            return
        self.getdata = True

    def end_h1(self):  # on </h1>
        if self.getdata:
            self.getdata = False

    def start_p(self, attrs):
        if not self.flag:
            return
        self.getdata = True

    def end_p(self):  # on </p>
        if self.getdata:
            self.getdata = False
    def handle_data(self, text):  # collect the text inside flagged tags
        if self.getdata:
            self.IDlist.append(text)
    def printID(self, new_file):
        out_path = os.path.join(os.getcwd(), 'txt', os.path.basename(new_file) + '.txt')
        with open(out_path, 'w', encoding='gbk') as f:
            j = 0
            for i in self.IDlist:
                print(i)
                # break the line after a full sentence, and after the first entry (the title)
                if '。' in i or j == 0:
                    f.write(i + '\n')
                    j += 1
                else:
                    f.write(i)
def html_to_txt(html_list):
    for a in html_list:
        with open(a, 'r', encoding='utf-8') as ff:
            html = ff.read()
        lister = GetIdList()
        lister.feed(html)
        lister.close()  # flush any data still buffered in the parser
        lister.printID(a)
```
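As a quick sanity check of the depth tracking in GetIdList, you can feed it an inline sample; this assumes the same article-content class the parser targets:

```python
from Html_To_txt import GetIdList

# the <p> sits one <div> deeper than the target container,
# so this also exercises the verbatim depth counter
sample = "<div class='article-content'><h1>标题</h1><div><p>正文。</p></div></div>"
lister = GetIdList()
lister.feed(sample)
lister.close()
print(lister.IDlist)  # ['标题', '正文。']
```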