#!/usr/bin/python3
import re
import requests
from bs4 import BeautifulSoup
# Scrape three fields from Baidu Baike entries: title, summary, and related URLs.
# Starting entry: Python; starting URL: http://baike.baidu.com/item/Python; target: 1,000 entries.
# Four parts: a URL manager, a downloader, a parser, and data output.
# The requests and BeautifulSoup libraries implement the downloader and parser;
# two set objects implement the URL manager.
# URL joining starts from the site root: http://baike.baidu.com
# new_urls = set()   # discovered but not yet crawled
# old_urls = set()   # already crawled
# A URL already in old_urls is neither crawled again nor re-added to new_urls;
# each iteration pops one URL from new_urls to crawl next.
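# A minimal sketch of that dedup rule (the URLs below are hypothetical):
#   old_urls = {'http://baike.baidu.com/item/Guido'}
#   found    = {'http://baike.baidu.com/item/Guido', 'http://baike.baidu.com/item/CPython'}
#   found.difference(old_urls)  # -> {'.../item/CPython'}; only this one gets queued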
def url_manager(links):
    # Queue newly found links, skipping any that were already crawled.
    # Relies on the module-level sets new_urls and old_urls defined under __main__.
    if links is not None:
        # Drop URLs that are already in old_urls (set difference never returns None,
        # so no second None check is needed).
        links = links.difference(old_urls)
        for link in links:
            new_urls.add(link)
def download_html(url):
    # Download a page and return its decoded HTML, or None on any request failure.
    headers = {
        # 'Host': 'static.tieba.baidu.com',
        'Referer': 'http://baike.baidu.com/item/Python',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36'}
    try:
        # timeout added so one hung request cannot stall the whole crawl
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        # Let requests infer the encoding from the page body.
        response.encoding = response.apparent_encoding
        return response.text
    except requests.RequestException:
        return None
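# Illustrative quick check of the downloader, using the script's own start URL:
#   html = download_html('http://baike.baidu.com/item/Python')
#   print(html is not None)   # True if the request succeeded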
def analysis(page_html, one_url):
    # Parse one page: append a 'url : title_summary' record to message
    # and return the related URLs found in the summary.
    links = []
    base_url = 'http://baike.baidu.com'
    soup = BeautifulSoup(page_html, 'html.parser')
    # Extract the entry title.
    title = soup.find('dd', class_="lemmaWgt-lemmaTitle-title").find('h1').get_text()
    # Extract the summary; strip the pronunciation note specific to the Python entry.
    introduction = soup.find('div', class_="lemma-summary").get_text().replace(
        '\nPython[1]\xa0\n(英国发音:/ˈpaɪθən/ 美国发音:/ˈpaɪθɑːn/),', '')
    # Collect related URLs, but only those linked from the summary.
    link_tags = soup.find('div', class_="lemma-summary").find_all('a', href=re.compile("^/item/"))
    for link in link_tags:
        # hrefs are site-relative ('/item/...'), so prepend the site root.
        new_url = base_url + link['href']
        links.append(new_url)
    message.append(one_url + ' : ' + title + '_' + introduction)
    # Return a set so url_manager can use set difference for deduplication.
    return set(links)
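# Shape of a record appended to message (the summary text is illustrative):
#   'http://baike.baidu.com/item/Python : Python_<summary text>'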
def out_data():
    # Print every collected 'url : title_summary' record.
    for record in message:
        print(record)
if __name__ == '__main__':
    new_urls = set()   # discovered but not yet crawled
    old_urls = set()   # already crawled
    message = []       # collected 'url : title_summary' records
    start_url = 'http://baike.baidu.com/item/Python'
    # Crawl the start page first.
    page_html = download_html(start_url)
    old_urls.add(start_url)
    if page_html:
        links = analysis(page_html, start_url)
        url_manager(links)
    # Then crawl the URLs found in page summaries.
    for _ in range(100):  # 100 pages here; raise toward the 1,000-entry target as needed
        if not new_urls:
            break  # frontier exhausted, nothing left to crawl
        url = new_urls.pop()
        old_urls.add(url)  # mark as crawled up front so it is never re-queued
        try:
            page_html = download_html(url)
            if not page_html:
                continue
            urls = analysis(page_html, url)
            url_manager(urls)
        except Exception:  # e.g. AttributeError when a page lacks the expected markup
            print('Failed to scrape:', url)
    # Print everything that was scraped, in order.
    out_data()