1. Download the required lexicons and organize the lexicon data (reading and deduplication)
# Merge multiple .scel files into a single dictionary and deduplicate
import struct
import os

# Offset of the pinyin table
startPy = 0x1540
# Offset of the Chinese word table
startChinese = 0x2628
# Global pinyin table
GPy_Table = {}
# Parse results: a list of (frequency, pinyin, word) tuples
GTable = []

# Decode raw 2-byte (UTF-16) code units to a string, skipping NULs
def byte2str(data):
    pos = 0
    ret = ''
    while pos < len(data):
        c = chr(struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0])
        if c != chr(0):
            ret += c
        pos += 2
    return ret
# Parse the pinyin table
def getPyTable(data):
    data = data[4:]
    pos = 0
    while pos < len(data):
        index = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
        pos += 2
        lenPy = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
        pos += 2
        py = byte2str(data[pos:pos + lenPy])
        GPy_Table[index] = py
        pos += lenPy

# Look up the pinyin of one word from its pinyin index table
def getWordPy(data):
    pos = 0
    ret = ''
    while pos < len(data):
        index = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
        ret += GPy_Table[index]
        pos += 2
    return ret
# Parse the Chinese word table
def getChinese(data):
    pos = 0
    while pos < len(data):
        # Number of homophones (words sharing this pinyin)
        same = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
        pos += 2
        # Length of the pinyin index table
        py_table_len = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
        pos += 2
        # Pinyin index table
        py = getWordPy(data[pos: pos + py_table_len])
        pos += py_table_len
        for i in range(same):
            # Length of the Chinese word
            c_len = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
            pos += 2
            # The Chinese word itself
            word = byte2str(data[pos: pos + c_len])
            pos += c_len
            # Length of the extension block
            ext_len = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
            pos += 2
            # Word frequency (first field of the extension block)
            count = struct.unpack('H', bytes([data[pos], data[pos + 1]]))[0]
            # Save the result
            GTable.append((count, py, word))
            # Skip to the offset of the next word
            pos += ext_len
def scel2txt(file_name):
    # Separator between files
    print('-' * 60)
    # Read the whole file
    with open(file_name, 'rb') as f:
        data = f.read()
    print("Dictionary name:", byte2str(data[0x130:0x338]))
    print("Dictionary type:", byte2str(data[0x338:0x540]))
    print("Description:", byte2str(data[0x540:0xd40]))
    print("Example words:", byte2str(data[0xd40:startPy]))
    getPyTable(data[startPy:startChinese])
    getChinese(data[startChinese:])
# Keep one word per line and deduplicate
def dedup(path):
    out_path = path[:-4] + '去重' + path[-4:]
    seen = []
    with open(path, 'r', encoding='utf-8') as f, open(out_path, 'w', encoding='utf-8') as out:
        for line in f:
            count, py, word = line.split('\t\t\t')
            if word not in seen:
                seen.append(word)
                out.write(word)
if __name__ == '__main__':
    # Folder containing the .scel files
    in_path = "./scel文件/西药学"
    fin = [fname for fname in os.listdir(in_path) if fname[-5:] == ".scel"]
    for f in fin:
        f = os.path.join(in_path, f)
        scel2txt(f)
    # Output file for the merged dictionary
    out_path = './scel文件/西药学/西药学.txt'
    # GTable holds the results as (frequency, pinyin, word) tuples; reformat as
    # needed. Nothing is sorted, so the order follows the input files.
    with open(out_path, 'w', encoding='utf-8') as f:
        for count, py, word in GTable:
            f.write(str(count) + '\t\t\t' + py + '\t\t\t' + word + '\n')
    dedup(out_path)
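As a quick sanity check, here is a minimal sketch (assuming the tab-separated format written above) that reads the merged file back and prints the first few entries:

# Read the merged dictionary back and show a few entries
with open('./scel文件/西药学/西药学.txt', encoding='utf-8') as f:
    for i, line in enumerate(f):
        count, py, word = line.rstrip('\n').split('\t\t\t')
        print(count, py, word)
        if i >= 4:  # the first five entries are enough for a spot check
            break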
2. Crawl the entry data for the words in the lexicon
from tqdm import tqdm
import requests
import urllib.parse
import time
import random

def write(filename, data):
    with open(filename, 'a', encoding='utf-8') as f:
        for line in tqdm(data):
            f.write(str(line) + '\n')
def content_data(name, aa):
    # Strip Baidu Baike page furniture and footnote markers
    aa = aa.replace('播报', '').replace('编辑', '').replace('\n[1]', '').replace('\n[2]', '').replace('\n[3]', '')
    # Collapse runs of blank lines
    for _ in range(30):
        aa = aa.replace('\n' * 2, '\n')
    # Everything before the table of contents ('目录') is header boilerplate
    index0 = aa.find('目录')
    if index0 == -1:
        return ''
    aa = aa[index0:]
    # Cut off the trailing sections (references, image galleries, ...)
    for i in ["V百科往期回顾", "参考资料", "更多图册", "词条图册"]:
        index1 = aa.find(i)
        if index1 != -1:
            aa = aa[:index1]
    # Read the numbered table of contents: after the '目录' line, the lines
    # alternate between a section number ("1", "2", ...) and its title
    mulu = []
    for j in range(20):
        if j == 0:
            # Skip the '目录' line itself
            s0 = aa.find('\n')
            aa = aa[s0 + 1:]
        else:
            s0 = aa.find('\n')
            c0 = aa[:s0]
            if str(j) == c0:
                s1 = aa.find('\n', s0 + 1)
                c1 = aa[s0 + 1:s1]
                mulu.append(c1)
                aa = aa[s1 + 1:]
            else:
                break
    if not mulu:
        return aa
    # The body starts at the first section heading (entry name + first title)
    s0 = aa.find(name + mulu[0])
    aa = aa[s0:]
    # Number each section heading for readability
    for index, i in enumerate(mulu):
        aa = aa.replace(name + i, str(index + 1) + '、' + name + i + ':')
    return aa
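To see what content_data produces, here is a minimal check on synthetic text that mimics the plain-text layout of a Baike page (entry name, summary, a numbered TOC under 目录, headings formed as name + title, then a references block); paste it under the function above to run it:

sample = ('Aspirin\nsummary text\n目录\n1\nOverview\n2\nUsage\n'
          'AspirinOverview\nbody of overview\nAspirinUsage\nbody of usage\n'
          '参考资料\nrefs')
print(content_data('Aspirin', sample))
# Prints the numbered body:
# 1、AspirinOverview:
# body of overview
# 2、AspirinUsage:
# body of usage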
# URL manager (Urlmanager.py):
class Urlmanager(object):
    def __init__(self):
        self.new_urls = set()
        self.old_urls = set()

    def has_new_url(self):
        '''Check whether any uncrawled URLs are left.'''
        return self.new_url_size() != 0

    def get_new_url(self):
        '''Pop one uncrawled URL and mark it as crawled.'''
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url

    def add_new_url(self, url):
        '''
        Add a single new URL to the uncrawled set.
        :param url: a single URL
        '''
        if url is None:
            return
        if url not in self.new_urls and url not in self.old_urls:
            self.new_urls.add(url)

    def add_new_urls(self, urls):
        '''
        Add a collection of new URLs to the uncrawled set.
        :param urls: a collection of URLs
        '''
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def new_url_size(self):
        '''Size of the uncrawled URL set.'''
        return len(self.new_urls)

    def old_url_size(self):
        '''Size of the crawled URL set.'''
        return len(self.old_urls)
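The crawler below ends up building URLs straight from the word list and leaves the manager calls commented out; for reference, a minimal sketch of how Urlmanager is meant to be driven (the two item URLs are just examples):

manager = Urlmanager()
manager.add_new_urls(['https://baike.baidu.com/item/惊悸',
                      'https://baike.baidu.com/item/中医'])
while manager.has_new_url():
    url = manager.get_new_url()  # pops one URL and marks it as crawled
    print(url, '| crawled so far:', manager.old_url_size())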
import requests

class HtmlDownloader(object):
    def download(self, url):
        if url is None:
            return None
        user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36"
        headers = {'User-Agent': user_agent}
        # A timeout keeps one stalled request from hanging the whole crawl
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            r.encoding = 'utf-8'
            return r.text
        return None
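download returns None on any non-200 response and raises on network errors, and transient failures are common when requesting many pages in a row; a hedged sketch of a retry wrapper around it (the backoff values are arbitrary):

import time
import requests

def download_with_retry(downloader, url, retries=3):
    # Try a few times with a growing pause; give up and return None after that
    for attempt in range(retries):
        try:
            html = downloader.download(url)
            if html is not None:
                return html
        except requests.RequestException:
            pass  # treat a network error like a failed attempt
        time.sleep(2 ** attempt)  # 1s, 2s, 4s backoff (arbitrary choice)
    return None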
# coding: utf-8
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup

class HtmlParser(object):
    def parser(self, page_url, html_cont):
        '''
        Parse the page content and extract the data.
        :param page_url: URL of the downloaded page
        :param html_cont: downloaded page content
        :return: the extracted data
        '''
        if page_url is None or html_cont is None:
            return
        soup = BeautifulSoup(html_cont, 'html.parser')
        # new_urls = self._get_new_urls(page_url, soup)
        new_data = self._get_new_data(page_url, soup)
        return new_data

    # Kept for reference: link extraction for a link-following crawl,
    # unused here because the URLs come from the word list instead.
    # def _get_new_urls(self, page_url, soup):
    #     '''Extract the set of new URLs found on the page.'''
    #     new_urls = set()
    #     links = soup.find_all('a', href=re.compile(r'^/item/[\%\w{2}]+/\d+'))
    #     for link in links:
    #         new_url = link['href']                     # the href attribute
    #         new_full_url = urljoin(page_url, new_url)  # make it absolute
    #         new_urls.add(new_full_url)
    #     return new_urls
    def _get_new_data(self, page_url, soup):
        '''
        Extract the useful data from the page.
        :param page_url: URL of the downloaded page
        :param soup: BeautifulSoup object for the page
        :return: dict with title, summary, and cleaned content
        '''
        data = {}
        # Entry title
        title = soup.find('dd', class_='lemmaWgt-lemmaTitle-title').find('h1')
        data['title'] = title.get_text()
        # get_text() returns all text inside the tag, descendants included
        summary = soup.find('div', class_='lemma-summary').get_text()
        data['summary'] = summary
        # Full page text, cleaned up by content_data() above
        text = soup.get_text()
        text = content_data(data['title'], text)
        data['content'] = text
        return data
# coding: utf-8
class DataOutput(object):
    def __init__(self):
        self.datas = []

    def store_data(self, data):
        if data is None:
            return
        self.datas.append(str(data))

    def output_html(self):
        # The name is historical: an earlier version wrote an HTML table
        # to baike.html; this one dumps everything to a text file instead.
        write('./中医.txt', self.datas)
class SpiderMan(object):
    def __init__(self):
        self.manager = Urlmanager()
        self.downloader = HtmlDownloader()
        self.parser = HtmlParser()
        self.output = DataOutput()

    def crawl(self):
        url1 = 'https://baike.baidu.com/item/'
        with open('./中医去重1.txt', encoding='utf-8') as f:
            data = f.readlines()
        total = len(data)
        print("Total number of entries:", total, '*' * 20)
        # data = data[:10]    # uncomment to test on a small sample
        # data = ['惊悸']
        for index, line in tqdm(enumerate(data)):
            name = str(line).strip('\n')
            # Baike item URLs take the percent-encoded entry name as the path
            key_word = urllib.parse.quote(name, encoding='utf-8', errors='replace')
            new_url = url1 + key_word
            # Alternative: feed the URL through the manager instead
            # self.manager.add_new_url(new_url)
            try:
                # Download the page
                html = self.downloader.download(new_url)
                # Extract the data
                item = self.parser.parser(new_url, html)
                # Store the result
                self.output.store_data(item)
                print('Crawled {} links:'.format(index + 1), name, '. Total entries:', total)
            except Exception:
                print('crawl failed')
            # Be polite: random pause between requests
            time.sleep(random.randint(0, 3))
        # Write all stored data out in one go
        self.output.output_html()

if __name__ == "__main__":
    spider_man = SpiderMan()
    spider_man.crawl()
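One caveat: output_html only writes after the whole loop finishes, so a crash partway through loses everything scraped so far. A hedged sketch of an append-as-you-go variant (IncrementalDataOutput is a made-up name; it builds on the DataOutput class above):

class IncrementalDataOutput(DataOutput):
    def store_data(self, data):
        # Append each record to disk immediately instead of buffering it
        if data is None:
            return
        with open('./中医.txt', 'a', encoding='utf-8') as f:
            f.write(str(data) + '\n')

    def output_html(self):
        pass  # nothing buffered; everything is already on disk

Swapping self.output = IncrementalDataOutput() into SpiderMan.__init__ would enable it.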
3. Remove words that have already been crawled from the lexicon
# coding: utf-8
from tqdm import tqdm
import os

# Baidu Baike medical entry data
vocabularyentry = []

def write(filename, data):
    with open(filename, 'w', encoding='utf-8') as f:
        for line in tqdm(data):
            f.write(str(line) + '\n')

def read(filename, encoding):
    with open(filename, 'r', encoding=encoding) as f:
        data = f.readlines()
    print(filename, "Total entries:", len(data), '*' * 20)
    return data

def remove_duplicate(filename):
    data = read(filename, encoding='utf-8')
    for line in data:
        # Strip the newline before the membership test, so that entries
        # collected from different files compare equal
        line = line.strip('\n')
        if line not in vocabularyentry:
            vocabularyentry.append(line)

if __name__ == '__main__':
    # Folder holding all the medical entry .txt files
    in_path = "./医疗词条数据"
    fin = [fname for fname in os.listdir(in_path) if fname[-4:] == ".txt"]
    for f in fin:
        f = os.path.join(in_path, f)
        remove_duplicate(f)
    # Path of the merged, deduplicated output
    out_path = './vocabularyentry.txt'
    write(out_path, vocabularyentry)
    print(out_path, "Final number of entries:", len(vocabularyentry), '*' * 20)
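A side note on efficiency: `line not in vocabularyentry` scans the whole list, so merging large files this way is quadratic. A hedged sketch of the same dedup using a set for O(1) lookups while keeping first-seen order (dedup_preserving_order is a hypothetical helper):

def dedup_preserving_order(lines):
    # Set for O(1) membership tests; list preserves first-seen order
    seen = set()
    out = []
    for line in lines:
        line = line.strip('\n')
        if line not in seen:
            seen.add(line)
            out.append(line)
    return out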
4. Deduplicate the final crawled entry data

This step reuses the Section 3 script unchanged: point in_path at the folder holding the final crawled entry files and run it again to produce the deduplicated vocabularyentry.txt.