# 1. Fetch Baidu Dict content with requests.
# 2. Parse the fetched HTML into plain text with BeautifulSoup.
# 3. On success, save the text as a local .txt file; later lookups
#    read straight from the local cache instead of re-fetching.
# Required third-party modules:
#   pip install requests
#   pip install beautifulsoup4
#   pip install lxml
#   pip install html5lib
#-*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os
def get_dict(keyword):
    """Look up *keyword* on Baidu Dict, caching results as local .txt files.

    Returns the plain-text definition, or None when the lookup fails
    (non-200 response, empty body, or no matching content on the page).
    """
    folder_path = 'E:\\pic\\dict\\'  # cache directory (was '\d' — an invalid escape)
    keyword_file = folder_path + keyword + '.txt'
    # exist_ok avoids the check-then-create race of the original
    # `if not os.path.exists(...)` guard.
    os.makedirs(folder_path, exist_ok=True)
    if os.path.exists(keyword_file):
        # Cached copy exists: return it instead of re-fetching.
        with open(keyword_file, 'r', encoding='utf-8') as f:
            return f.read()
    url = 'http://dict.baidu.com/s'
    params = {'wd': keyword, 'home': 'pc'}
    # BUG FIX: the header name must be 'User-Agent'; the original key
    # 'user_agent' is not a recognized HTTP header, so no UA was sent.
    headers = {'User-Agent': 'Mozilla/5.0'}
    # timeout keeps a dead connection from hanging the call forever.
    with requests.get(url, headers=headers, params=params, timeout=10) as r:
        if r.status_code != 200:
            return None
        text = r.text
    if not text:
        return None
    soup = BeautifulSoup(text, 'lxml')
    # NOTE(review): 'div .tab-content' selects .tab-content *descendants of*
    # a div; if the target is a div WITH class tab-content, the selector
    # should be 'div.tab-content' (no space) — confirm against the page.
    tab_content = soup.select('div .tab-content')
    if not tab_content:
        return None
    # Several tabs may match; the last one holds the wanted content.
    content = tab_content[-1].text
    if not content:
        return None
    # Strip newlines and the trailing "查看百科" link text.
    content = content.replace('\n', '').replace('查看百科', '').strip()
    # `with` already flushes and closes on exit; no explicit flush needed.
    with open(keyword_file, 'w', encoding='utf-8') as f:
        f.write(content)
    return content
if __name__ == '__main__':
    # Demo lookup when the module is run as a script.
    result = get_dict("python")