我写了一个Python程序,爬取了百度百科上有关妇产科的语料,把它放在了python文件同级目录 “data/产科语料.txt”, 所以要想跑通这个程序,一定要把data文件夹建立上,不然会报找不到文件错误。
我的实验环境是
python 3.5 ,第三方包:requests(发送 HTTP 请求)、BeautifulSoup 和 lxml(解析 html 的包,均可以通过相应 pip 命令下载获得),本程序与操作系统无关。
程序目录结构:Python 文件与 data 文件夹位于同一级目录下。
程序如下:
#! /usr/bin/python
# coding=utf-8
import requests
import time
import re
from bs4 import BeautifulSoup
# HTTP request headers sent with every Baidu Baike request
# (desktop IE11 user-agent; Host pinned to baike.baidu.com).
__http_headers__ = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
                    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
                    'Connection': 'keep-alive',
                    'Host': 'baike.baidu.com',
                    'Accept-Encoding': 'gzip, deflate, br',
                    'Accept-Language': 'zh-CN,zh;q=0.9',
                    }
A_HISTORY = []  # URLs of pages that have already been visited
A_CURRENT = ['https://baike.baidu.com/item/%E4%BA%A7%E7%A7%91']  # crawl frontier, seeded with the "obstetrics" entry page
count = 1  # running number of pages processed (used only for progress output)
def has_url(str_url, history=None):
    """Return True if *str_url* has already been visited.

    :param str_url: URL string to look up
    :param history: optional list of visited URLs; defaults to the
        module-level A_HISTORY (backward compatible with the old
        single-argument call)
    :return: bool
    """
    # Membership test replaces the original index loop over
    # range(len(A_HISTORY)) — same O(n) semantics, idiomatic form.
    return str_url in (A_HISTORY if history is None else history)
def process_data(url_str, f_):
    """Fetch one Baidu Baike page, append its relevant text to the corpus
    file, and queue its outgoing /item/ links for later crawling.

    :param f_: open text-file handle the corpus is appended to
    :param url_str: URL string of the page to process
    :return: None
    """
    global count
    if has_url(url_str):
        print("这个url已经访问过了")
        return
    try:
        res = requests.get(url_str, headers=__http_headers__)
    except requests.RequestException as exc:
        # A single network failure should not kill the whole crawl;
        # skip this URL (it stays unvisited and may be retried if re-queued).
        print('请求失败: ' + str(exc))
        return
    # BUG FIX: Response.status_code is an int, so the original comparison
    # against the string '403' was always True and blocked (403) responses
    # were processed as if they were valid pages.
    if res.status_code != 403:
        res.encoding = 'utf-8'
        html_doc = res.text
        soup = BeautifulSoup(html_doc, 'lxml')  # parsed document object
        # Collect text of every div whose class contains 'para'
        # (Baike paragraph containers), stripping spaces and newlines.
        iterator = []
        for div in soup.find_all('div', attrs={'class': True}):
            if 'para' in str(div['class']).strip():
                iterator.append(str(div.text).replace(' ', '').replace('\n', ''))
        content = ''.join(iterator)
        # Remove remaining space characters inside the text.
        strinfo = re.compile(' ')
        content = strinfo.sub('', content)
        # Relevance filter. NOTE(review): due to and/or precedence this is
        # equivalent to just '妇产科' in content — the second clause is
        # redundant; kept as-is pending confirmation of the intended filter.
        if '妇产科' in str(content) or '就诊科室' in str(content) and '妇产科' in str(content):
            # Queue every internal encyclopedia link found on this page.
            for a in soup.find_all('a', attrs={'href': True}):
                href = a['href'].strip()
                if str(href).startswith('/item/%'):
                    A_CURRENT.append('https://baike.baidu.com' + href)
            # Append the harvested text to the corpus file.
            f_.write(content)
            f_.flush()
            # Mark the page as visited only when it matched — irrelevant
            # pages are NOT recorded in A_HISTORY (original behavior;
            # they may be re-fetched if re-queued — TODO confirm intent).
            A_HISTORY.append(url_str)
            print('第 ' + str(count) + ' 个页面处理完成......')
            count += 1
        else:
            print('链接不相关')
if __name__ == '__main__':
    print("开始处理数据.....")
    time.sleep(1)
    # Playful start-up banner: one fragment per second, exactly as before.
    for banner in ("......", "发", "车", "了", "赶", "快", "上", "车", "......"):
        print(banner)
        time.sleep(1)
    print("嘀嘀...嘀嘀..")
    time.sleep(3)
    # BUG FIX: the original opened the file and called f.close() after the
    # loop, so any exception during the crawl leaked the handle; the
    # context manager guarantees the corpus file is closed.
    # NOTE: the 'data' directory must already exist or open() raises.
    with open('data/产科语料.txt', 'a', encoding='utf-8') as f:
        # Breadth-first crawl: process_data appends newly discovered
        # links to the end of A_CURRENT while we pop from the front.
        while len(A_CURRENT) > 0:
            # pop(0) is the clearer equivalent of the original
            # A_CURRENT.remove(A_CURRENT[0]) (both drop the head element).
            url = A_CURRENT.pop(0)
            process_data(url, f)
    print("什么时候到我这里啊,我尿急")