【Experiment Content】
I. Basic Principles of Web Crawler Implementation
1. Web server connector
◆ Getting familiar with requests/response (experiment)
import requests
r = requests.get("http://www.hzau.edu.cn/",
                 headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
r.encoding = "utf-8"
print(r.text)
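Before parsing r.text it is worth confirming that the request actually succeeded; a small sketch using standard requests calls (added as a suggestion, not part of the original experiment):
import requests

r = requests.get("http://www.hzau.edu.cn/", timeout=10)
r.raise_for_status()              # raises requests.HTTPError on 4xx/5xx responses
r.encoding = r.apparent_encoding  # guess the charset from the response body
print(r.status_code, len(r.text))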
◆ Getting familiar with DNS caching (experiment)
import dns.resolver
import numpy as np
'''
a = dns.resolver.resolve("www.hzau.edu.cn", "A")
# "A" requests an address record, i.e. converts the hostname to an IP address
ip = a.response.answer[0].to_text().split(" ")[-1]
# extract the corresponding IP address
print("IP address of the HZAU homepage: " + ip)
a = dns.resolver.resolve("www.hzau.edu.cn", "A")
an = a.response.answer[0]
ip = []
ip.append(an.to_text().split(" ")[0] + " " + an.to_text().split(" ")[-1])
print(ip)
'''
a = dns.resolver.resolve("www.hzau.edu.cn", "A")
an = a.response.answer[0]
ip = []
ip.append(an.to_text().split(" ")[0] + " , " + an.to_text().split(" ")[-1])
print(np.array(ip))
Reflection: storing the entry in a list keeps both the original hostname and the resolved IP address, which is exactly the mapping the cache must maintain. The list can additionally be converted to an array for storage with numpy.array() (numpy only changes the storage form; a plain list works as well).
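Since the point of a DNS cache is fast lookup by hostname, a dict is arguably the more natural container; a minimal sketch of a dict-based cache (my own illustration, not required by the experiment):
import dns.resolver

dns_cache = {}  # hostname -> IP address

def resolve_cached(host):
    # return the cached IP if present, otherwise query DNS and remember the result
    if host not in dns_cache:
        answer = dns.resolver.resolve(host, "A")
        dns_cache[host] = answer[0].to_text()  # first A record, as a string
    return dns_cache[host]

print(resolve_cached("www.hzau.edu.cn"))
print(resolve_cached("www.hzau.edu.cn"))  # second call is served from the cache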
◆ Robots file parsing (experiment)
import urllib.robotparser
import requests
# Read the robots.txt file
rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://item.taobao.com/robots.txt")
rp.read()
# Pretend to be Baiduspider
useragent = 'Baiduspider'
url = 'https://item.taobao.com/item.htm?spm=a310p.7395781.1998038982.1&id=16041384170'
if rp.can_fetch(useragent, url):
    print("Fetching allowed")
    file = requests.get(url)
    data = file.content  # full response body
    fb = open("bd.html", "wb")  # save the fetched page locally
    fb.write(data)
    fb.close()
else:
    print("Fetching not allowed")
Reflection: to test the policy for another crawler, it suffices to change the useragent string and call can_fetch() again.
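A quick way to compare how robots.txt treats different crawlers (the agent list below is my own choice; rp and url are reused from the code above):
for agent in ['Baiduspider', 'Googlebot', '*']:
    print(agent, rp.can_fetch(agent, url))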
◆ Error and exception handling (experiment)
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
url = 'http://www.kd008.com/server.php?sid=1'
try:
    req = requests.get(url, timeout=5)
    print(req.status_code)
except ReadTimeout:
    # timed out
    print('Timeout')
    # put the current url back into the task queue and retry later
except ConnectionError:
    # connection failed
    print('Connection error')
except RequestException:
    # any other request error
    print('Error')
else:
    if req.status_code == 200:
        print('Access OK!')
        # save the fetched page req.text locally
        fb = open("t.html", "wb")
        fb.write(req.content)
        fb.close()
    if req.status_code == 404:
        print('Page not found!')
        # remove the current url from the crawl tasks
    if req.status_code == 403:
        print('Access forbidden!')
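The comment in the Timeout branch suggests re-queuing the url and retrying later; a minimal sketch of that idea (the retry count and delay are my own assumptions):
import time
import requests
from requests.exceptions import RequestException

def fetch_with_retry(url, retries=3, delay=5):
    # try the request up to `retries` times, sleeping `delay` seconds between tries
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=5)
        except RequestException:
            print('attempt %d failed: %s' % (attempt + 1, url))
            time.sleep(delay)
    return None  # give up; the caller can re-queue the url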
2. Hyperlink (URL) extraction and filtering
◆ URL extraction method (experiment)
import re
s = '''<li><a href="http://news.sina.com.cn/o/2018-11-06/a75.shtml"
target="_blank">进博会</a></li>
<li><a href="http://news.sina.com.cn/o/2018-11-06/a76.shtml"
target="_blank">大数据</a></li>
<li><a href="/o/2018-11-06/a75.shtml" target="_blank">进博会</a></li>'''
urls = re.findall('<a href="[a-zA-Z0-9/\.\-:]+', s)
# urls = re.findall('<a href="http://[a-zA-Z0-9/\.\-:]+"', s)
print(urls)
for url in urls:
    print(url[9:])  # strip the leading '<a href="'; with the commented-out
    # pattern above, which also matches the closing quote, use url[9:-1]
Reflection: a pattern that starts with http:// can only match absolute hyperlinks; removing the http:// part makes the pattern match hyperlinks with any beginning, including relative ones such as /o/2018-11-06/a75.shtml.
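Relative links must be converted to absolute URLs before they can be fetched; a sketch with urllib.parse.urljoin (the base URL is assumed from the sample HTML):
from urllib.parse import urljoin

base = 'http://news.sina.com.cn/'
print(urljoin(base, '/o/2018-11-06/a75.shtml'))
# -> http://news.sina.com.cn/o/2018-11-06/a75.shtml
print(urljoin(base, 'http://news.sina.com.cn/o/2018-11-06/a76.shtml'))
# absolute URLs pass through unchanged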
3. Crawl strategy search
Reflection:
As in the data structures and algorithms course, DFS can be implemented with a stack (here, via recursion) while BFS uses a queue. Since links farther from the seed node tend to be less closely related to it, we cap the depth, keeping the crawl within the first two or three levels where the links we want are most likely to be found.
import re
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
import urllib3
# requests verifies SSL certificates on https URLs; verification is switched off
# in get() below, which makes urllib3 print a warning
# suppress that warning
urllib3.disable_warnings()
visited = []      # urls that have already been visited (crawled)
unvisited = []    # urls extracted but not yet visited
url_count = 0     # number of urls visited so far
END_COUNT = 50    # stop after this many urls
end_flag = False  # termination flag

# Visit a page: collect the urls it contains
def visit(url, depth):
    visited.append(url)  # mark this link as visited
    try:
        req = requests.get(url, verify=False, timeout=5)
        # print(req.status_code)
        # verify=False: disable SSL verification
    except ReadTimeout:  # timed out
        print('Timeout: ', url)
        # put the current url back into the task queue and retry later
    except ConnectionError:  # connection failed
        print('Connection error: ', url)
    except RequestException:  # any other request error
        print('Error: ', url)
    else:
        if req.status_code == 404:
            print('404 page not found: ', url)
        if req.status_code == 403:
            print('403 access forbidden: ', url)
        if req.status_code == 200:
            # on success, bump the counter and check whether to stop
            global url_count
            global end_flag
            url_count += 1
            if url_count >= END_COUNT:
                end_flag = True
            print("\t" * depth, "#%d-%d %s" % (depth, url_count, url))
            PATTERN_URl = "<a.*href=\"(https?://.*?)[\"|\'].*"
            ulist = re.findall(PATTERN_URl, req.text)
            return ulist
    return None

def dfs(url, depth=1):
    ulist = visit(url, depth)
    if ulist:
        ulist = list(set(ulist) - set(visited))
        # ulist is local: the children of a single node
        for url in ulist:
            if depth < 3 and not end_flag:
                dfs(url, depth + 1)

def bfs(url):
    depth = 0
    global unvisited
    unvisited.append([url, depth])
    while unvisited:
        # each element of unvisited is [url, depth]
        [url, depth] = unvisited.pop(0)
        if end_flag or depth >= 3:
            break
        ulist = visit(url, depth)
        if ulist:
            ulist = list(set(ulist) - set(visited))
            depth += 1
            # the urls in ulist are children of the current url, hence depth + 1
            unvisited = unvisited + [[url, depth] for url in ulist]

if __name__ == '__main__':
    start_url = "http://www.hzau.edu.cn"
    strategy = input("Enter dfs/bfs: ")
    if strategy == "dfs":
        print("\t" * 0, "#%d %s" % (0, start_url))
        dfs(start_url)
    elif strategy == "bfs":
        bfs(start_url)
    else:
        print("Invalid input, please enter dfs or bfs")
4. Page content extraction
◆ Using html.parser (experiment)
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):  # subclass HTMLParser
    ctag = False
    # whether the tag being parsed is the one that holds the content

    def handle_starttag(self, tag, attrs):
        print('begin a tag:' + tag)
        if tag == 'h1':
            for attr in attrs:
                print(attr[0])
                if attr[1] == 'center':
                    self.ctag = True
                    break

    def handle_data(self, data):
        print('handle a tag')
        if self.ctag:
            print("Extracted data :", data)

    def handle_endtag(self, tag):
        print('end a tag:' + tag)
        self.ctag = False

parser = MyHTMLParser()
parser.feed('<html><head><title>Test</title></head>'
            '<body><h1 align="center">Big data news</h1><h1 align="center">'
            'AI news</h1><h1 align="right">2018.8.1</h1></body></html>')
◆ Using lxml (experiment)
from lxml import etree
html = '<html><head><title>Test</title></head><body><h1 ' \
       'align="center">Big data news</h1><h1 align="center">AI news</h1><h1 ' \
       'align="right">2018.8.1</h1></body></html>'
content = etree.fromstring(html)
rows = content.xpath('/html/body/h1')  # all nodes matching the path expression
for row in rows:  # process each node
    t = row.xpath('./text()')[0]
    print(t)

# Extracting a data table
html = '''<html><head><title>Test</title></head><body>
<table id="table1" cellspacing="0px">
<tr><th>学号</th><th>姓名</th><th>成绩</th></tr>
<tr><td>1001</td><td>曾平</td><td>90</td></tr>
<tr><td>1002</td><td>王一</td><td>92</td></tr>
<tr><td>1003</td><td>张三</td><td>88</td></tr>
</table></body></html>'''
content = etree.HTML(html)
rows = content.xpath('//table[@id="table1"]/tr')[1:]
for row in rows:
    id = row.xpath('./td[1]/text()')[0]
    name = row.xpath('./td[2]/text()')[0]
    score = row.xpath('./td[3]/text()')[0]
    print(id, name, score)

# Extract only the last record
content = etree.HTML(html)
rows = content.xpath('//table[@id="table1"]/tr[last()]')
for row in rows:
    id = row.xpath('./td[1]/text()')[0]
    name = row.xpath('./td[2]/text()')[0]
    score = row.xpath('./td[3]/text()')[0]
    print(id, name, score)
◆ Using BeautifulSoup (experiment)
# Tag usage example:
from bs4 import BeautifulSoup
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'xml')
# get the <b> tag
tag = soup.b
print(type(tag))
print(tag['class'])
print(tag.string)
# Sample code: information extraction with BeautifulSoup
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
html = '''
<html><body><div id="second-title">访华前 这个国家的总理说“感谢中国体谅”</div>
<div class="date-source"><span class="date">2019 年 03 月 27 日 21:30</span></div>
<span class="publish source">参考消息</span><div class="article">
<p>原标题:锐参考 | 访华前,这个国家的总理说:“感谢中国体谅!”</p><p>“非常感谢中国的理解!”</p>
<p>在 25 日的新闻发布会上,新西兰总理杰辛达·阿德恩这样说道。</p></div></body></html>
'''
soup = BeautifulSoup(html, 'lxml')
# an id is prefixed with #
title = soup.select('div#second-title')[0].text
# a class name is prefixed with a dot
date = soup.select('span.date')[0].text
# spaces inside a class attribute are replaced by dots: publish.source
source = soup.select('span.publish.source')[0].text
# child tags are selected with >
content = soup.select('div.article > p')
contentstr = ''
for i in range(len(content)):
    contentstr += content[i].text + "\n"
print("Title:", title)
print("Publication date:", date)
print("Source:", source)
print("Content:", contentstr)
II. Implementing a Topic Crawler
Case study: a crawler for a specific news topic
Relevance is computed with the given relevance measures:
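Two set-based measures appear in the code below, with T the topic word set and D the set of representative words extracted from the page: the Jaccard similarity sim = |T∩D| / |T∪D| = |T∩D| / (|T| + |D| − |T∩D|), and the cosine similarity of the binary term vectors, cos = |T∩D| / (√|T| · √|D|); the cosine form here follows the standard binary-vector definition.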
# Sample code: topic-based news collection with BeautifulSoup
import urllib.robotparser
import requests
from bs4 import BeautifulSoup
import jieba
from gensim.corpora.dictionary import Dictionary
import os
import re

# Save a file
def savefile(file_dir, content, seq):
    file_path = file_dir + os.sep + str(seq) + '.html'
    f = open(file_path, "wb")
    f.write(content.encode("utf-8"))  # encode to bytes
    f.close()

# Set the HTTP header fields
useragent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'
http_headers = {
    'User-Agent': useragent,
    'Accept': 'text/html'
}
# the topic is defined as a set of keywords
topicwords = {"网络", "安全", "法案", "预警", "设施", "互联网"}
website = 'http://roll.news.sina.com.cn/'
url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index.shtml'
file_dir = 'd:\\'  # directory for saved files
rp = urllib.robotparser.RobotFileParser()
rp.set_url(website + "robots.txt")
rp.read()
# make sure robots.txt permits access
if rp.can_fetch(useragent, url):
    page = requests.get(url, headers=http_headers)
    page.encoding = 'gb2312'
    content = page.text
    # load the stop-word list
    stoplist = open('stopword.txt', 'r', encoding="utf-8").readlines()
    stoplist = set(w.strip() for w in stoplist)
    # extract strings of the form href="http://news.sina.com.cn/o/2018-11-06/doc-ihmutuea7351575.shtml"
    ulist = re.findall('href="http://[a-z0-9/.\-]+\.shtml', content)
    seq = 1  # sequence number of the saved file (renamed from i to avoid
             # clashing with loop variables below)
    for u in ulist:
        u = u[6:]
        print(u)
        page = requests.get(u, headers=http_headers)
        page.encoding = 'utf-8'
        content = page.text
        bs = BeautifulSoup(content, 'lxml')
        ps = bs.select('div#article > p')
        doc = []
        for p in ps:
            p = p.text.strip("\n")
            if p != "":
                d = []
                # tokenize and filter
                for w in list(jieba.cut(p, cut_all=True)):
                    if len(w) > 1 and w not in stoplist:
                        d.append(w)
                doc.append(d)
        # print(doc)
        # Feature selection: keep words that occur at least 2 times and whose
        # fraction of documents is <= 1.0; take the first 10 such words as the
        # page's representative terms
        dictionary = Dictionary(doc)
        dictionary.filter_extremes(no_below=2, no_above=1.0, keep_n=10)
        d = dict(dictionary.items())
        docwords = set(d.values())
        # Relevance: Jaccard similarity of the topicwords and docwords sets
        commwords = topicwords.intersection(docwords)
        sim = len(commwords) / (len(topicwords) + len(docwords) - len(commwords))
        # Relevance: cosine similarity of the binary term vectors,
        # |T ∩ D| / (sqrt(|T|) * sqrt(|D|))
        num = 0
        for w in docwords:
            if w in topicwords:
                num = num + 1
        m = len(topicwords) ** 0.5
        n = len(docwords) ** 0.5
        if n == 0:
            cosx = 0.0
        else:
            cosx = num / (m * n)
        # if the similarity passes the threshold, the page is considered
        # on-topic and is saved to a file
        if cosx > 0.01:
            print(docwords)
            print("cos=", cosx)
            savefile(file_dir, content, seq)
            seq = seq + 1
else:
    print('Fetching not allowed!')
'''
Alternative: threshold on the Jaccard similarity instead of the cosine:
        if sim > 0.1:
            print(docwords)
            print("sim=", sim)
            savefile(file_dir, content, seq)
            seq = seq + 1
'''
III. Implementing a Dynamic-Page Crawler
(1) Build a URL with parameters and pass the dynamic request through them;
url = 'https://search.jd.com/Search'
# store the query keyword and attributes in a dict
qrydata = {
    'keyword': '互联网大数据',
    'enc': 'utf-8',
}
lt = []
for k, v in qrydata.items():
    lt.append(k + '=' + str(v))
query_string = '&'.join(lt)
url = url + '?' + query_string
print(url)
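requests can build and encode the query string by itself through the params argument; a sketch of the equivalent request:
import requests
r = requests.get('https://search.jd.com/Search',
                 params={'keyword': '互联网大数据', 'enc': 'utf-8'})
print(r.url)  # requests appends the URL-encoded query string automatically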
(2) Build a Cookie to carry the parameters, passing the dynamic request's parameters through the HTTP headers;
import requests
import re
# Copy the Cookie from the browser's developer tools and save it to taobao.txt
f = open(r'taobao.txt', 'r')  # open the saved cookie file
cookies = {}  # initialize the cookies dict
for line in f.read().split(';'):  # split the string on ';'
    name, value = line.strip().split('=', 1)
    cookies[name] = value  # add the entry to the cookies dict
r = requests.get("https://www.taobao.com/", cookies=cookies)
# print(r.text)
rs = re.findall(u'<title>.*</title>', r.text)
# expected: <title>淘宝网 - 淘!我喜欢</title>
print(rs)
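A requests.Session keeps cookies across requests automatically, so later requests in the same crawl reuse them without re-parsing the file; a minimal sketch using the cookies dict built above:
import requests
s = requests.Session()
s.cookies.update(cookies)  # seed the session with the parsed cookies
r = s.get("https://www.taobao.com/")  # cookies are sent and updated automatically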
(3) Ajax dynamic request technique
import requests
import json
url = 'https://hotels.ctrip.com/hotel/beijing1'
# the payload below was copied from the request observed in the browser
payloadHeader = {'content-type': 'application/json'}
payload = {"PlatformType":"pc","pageParameter":{"Refer":"","UA":"Mozilla%2F5.0%20(Windows%20NT%2010.0%3B%20Win64%3B%20x64)%20AppleWebKit%2F537.36%20(KHTML%2C%20like%20Gecko)%20Chrome%2F110.0.0.0%20Safari%2F537.36%20Edg%2F110.0.1587.63","PageID":102002,"VID":"1678325627776.3f3odn"},"marketParameter":{"AID":0,"SID":0},"terminalParameter":{"UserID":"","CityID":0},"pcAuthCodeParamet":{"IsGetAuthCode":"true","AppID":"","Length":4}}
# send the request with POST, passing the parameters via the data argument
res = requests.post(url, data=json.dumps(payload), headers=payloadHeader)
res.encoding = 'utf-8'
print(res.text)
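requests can also serialize the payload and set the Content-Type header itself via the json parameter; the call below should be equivalent to the data=json.dumps(...) form above:
res = requests.post(url, json=payload)  # serializes payload and sets content-type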
(4) Browser simulation.
Sample code: simulating header information
import requests
useragent = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Mobile Safari/537.36'
http_headers = {
    'User-Agent': useragent,
    'Accept': 'text/html'
    # other header fields
}
page = requests.get(url, headers=http_headers)  # url is the address to request
Sample code: simulating a Baidu search with Selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
browser = webdriver.Edge(executable_path=r'msedgedriver.exe')
browser.get('https://www.baidu.com/')
time.sleep(2)  # wait for the page to load
input = browser.find_element_by_name("wd")
input.send_keys("信息检索")
input.send_keys(Keys.ENTER)
print(browser.current_url)
print(browser.page_source)
time.sleep(2)
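Note that executable_path and find_element_by_name belong to Selenium 3 and were removed in Selenium 4; if a recent Selenium is installed, the equivalent calls are roughly (a sketch assuming Selenium 4.6+, which locates the Edge driver by itself):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

browser = webdriver.Edge()  # Selenium Manager finds msedgedriver automatically
browser.get('https://www.baidu.com/')
box = browser.find_element(By.NAME, "wd")  # replaces find_element_by_name
box.send_keys("信息检索", Keys.ENTER)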
IV. Implementing a Deep-Web Page Crawler
import requests
from bs4 import BeautifulSoup
import traceback
import os
import urllib.parse

# Read the list of publishers
def read_list(txt_path):
    press_list = []
    f = open(txt_path, 'r')
    for line in f.readlines():
        press_list.append(line.strip('\n'))
    return press_list

# Locate the input tag and build the query URL
def build_form(press_name):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'}
    res = requests.get('http://search.dangdang.com/advsearch', headers=header)
    res.encoding = 'GB2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    # locate the input tag
    input_tag_name = ''
    conditions = soup.select('.box2 > .detail_condition > label')
    print('Found %d basic search conditions, looking for the input tag' % len(conditions))
    for item in conditions:
        text = item.select('span')[0].string
        if text == '出版社':
            input_tag_name = item.select('input')[0].get('name')
            print('Found the input tag, name:', input_tag_name)
    # build the url
    keyword = {'medium': '01',
               input_tag_name: press_name.encode('gb2312'),
               'category_path': '01.00.00.00.00.00',
               'sort_type': 'sort_pubdate_desc'
               }
    url = 'http://search.dangdang.com/?'
    url += urllib.parse.urlencode(keyword)
    print('Entry URL: %s' % url)
    return url

# Scrape the information
def get_info(entry_url):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'}
    res = requests.get(entry_url, headers=header)
    res.encoding = 'GB2312'
    # parsing with lxml loses part of the content here
    soup = BeautifulSoup(res.text, 'html.parser')
    # number of result pages
    page_num = int(soup.select('.data > span')[1].text.strip('/'))
    print('%d pages to scrape; only 1 page is collected in this test' % page_num)
    page_num = 1  # only fetch 1 page in this test
    page_now = '&page_index='
    # title, price, publication date
    books_title = []
    books_price = []
    books_date = []
    for i in range(1, page_num + 1):
        now_url = entry_url + page_now + str(i)
        print('Fetching page %d, URL: %s' % (i, now_url))
        res = requests.get(now_url, headers=header)
        soup = BeautifulSoup(res.text, 'html.parser')
        # titles
        tmp_books_title = soup.select('ul.bigimg > li[ddt-pit] > a')
        for book in tmp_books_title:
            books_title.append(book.get('title'))
        # prices
        tmp_books_price = soup.select('ul.bigimg > li[ddt-pit] > p.price > span.search_now_price')
        for book in tmp_books_price:
            books_price.append(book.text)
        # publication dates
        tmp_books_date = soup.select('ul.bigimg > li[ddt-pit] > p.search_book_author > span')
        for book in tmp_books_date[1::3]:
            books_date.append(book.text[2:])
    books_dict = {'title': books_title, 'price': books_price, 'date': books_date}
    return books_dict

# Save the data
def save_info(file_dir, press_name, books_dict):
    res = ''
    try:
        for i in range(len(books_dict['title'])):
            res += (str(i + 1) + '. ' + 'Title: ' + books_dict['title'][i] + '\r\n' +
                    'Price: ' + books_dict['price'][i] + '\r\n' +
                    'Publication date: ' + books_dict['date'][i] + '\r\n' + '\r\n'
                    )
    except Exception as e:
        print('Error while saving')
        print(e)
        traceback.print_exc()
    finally:
        file_path = file_dir + os.sep + press_name + '.txt'
        f = open(file_path, "wb")
        f.write(res.encode("utf-8"))
        f.close()
    return

# Entry point
def start_spider(press_path, saved_file_dir):
    # read the publisher list
    press_list = read_list(press_path)
    for press_name in press_list:
        print('------ Start scraping %s ------' % press_name)
        press_page_url = build_form(press_name)
        books_dict = get_info(press_page_url)
        save_info(saved_file_dir, press_name, books_dict)
        print('------ Publisher %s done ------' % press_name)
    return

if __name__ == '__main__':
    # path to the file listing publisher names
    press_txt_path = r'press.txt'
    # directory where the scraped data is saved
    saved_file_dir = r'D:\files'  # the folder must already exist
    # start
    start_spider(press_txt_path, saved_file_dir)