【Experiment Content】
I. Basic Principles of Web Crawler Implementation
1. Web server connector
◆ Getting familiar with requests/response (experiment)
import requests
r = requests.get("http://www.hzau.edu.cn/",
                 headers={'User-Agent': 'Mozilla/5.0'}, timeout=10)
r.encoding = "utf-8"
print(r.text)
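Before parsing r.text it is worth confirming that the request actually succeeded; a small sketch using standard requests calls (added as a suggestion, not part of the original experiment):
import requests

r = requests.get("http://www.hzau.edu.cn/", timeout=10)
r.raise_for_status()              # raises requests.HTTPError on 4xx/5xx responses
r.encoding = r.apparent_encoding  # guess the charset from the response body
print(r.status_code, len(r.text))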
◆ Getting familiar with DNS caching (experiment)
import dns.resolver
import numpy as np
'''
a = dns.resolver.resolve("www.hzau.edu.cn", "A")
# "A" requests an address record, i.e. converts the hostname to an IP address
ip = a.response.answer[0].to_text().split(" ")[-1]
# extract the corresponding IP address
print("IP address of the HZAU homepage: " + ip)
a = dns.resolver.resolve("www.hzau.edu.cn", "A")
an = a.response.answer[0]
ip = []
ip.append(an.to_text().split(" ")[0] + " " + an.to_text().split(" ")[-1])
print(ip)
'''
a = dns.resolver.resolve("www.hzau.edu.cn", "A")
an = a.response.answer[0]
ip = []
ip.append(an.to_text().split(" ")[0] + " , " + an.to_text().split(" ")[-1])
print(np.array(ip))
Reflection: storing the entry in a list keeps both the original hostname and the resolved IP address, which is exactly the mapping the cache must maintain. The list can additionally be converted to an array for storage with numpy.array() (numpy only changes the storage form; a plain list works as well).
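Since the point of a DNS cache is fast lookup by hostname, a dict is arguably the more natural container; a minimal sketch of a dict-based cache (my own illustration, not required by the experiment):
import dns.resolver

dns_cache = {}  # hostname -> IP address

def resolve_cached(host):
    # return the cached IP if present, otherwise query DNS and remember the result
    if host not in dns_cache:
        answer = dns.resolver.resolve(host, "A")
        dns_cache[host] = answer[0].to_text()  # first A record, as a string
    return dns_cache[host]

print(resolve_cached("www.hzau.edu.cn"))
print(resolve_cached("www.hzau.edu.cn"))  # second call is served from the cache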
◆ Robots file parsing (experiment)
import urllib.robotparser
import requests
# Read the robots.txt file
rp = urllib.robotparser.RobotFileParser()
rp.set_url("https://item.taobao.com/robots.txt")
rp.read()
# Pretend to be Baiduspider
useragent = 'Baiduspider'
url = 'https://item.taobao.com/item.htm?spm=a310p.7395781.1998038982.1&id=16041384170'
if rp.can_fetch(useragent, url):
    print("Fetching allowed")
    file = requests.get(url)
    data = file.content  # full response body
    fb = open("bd.html", "wb")  # save the fetched page locally
    fb.write(data)
    fb.close()
else:
    print("Fetching not allowed")
Reflection: to test the policy for another crawler, it suffices to change the useragent string and call can_fetch() again.
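A quick way to compare how robots.txt treats different crawlers (the agent list below is my own choice; rp and url are reused from the code above):
for agent in ['Baiduspider', 'Googlebot', '*']:
    print(agent, rp.can_fetch(agent, url))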
◆ Error and exception handling (experiment)
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
url = 'http://www.kd008.com/server.php?sid=1'
try:
    req = requests.get(url, timeout=5)
    print(req.status_code)
except ReadTimeout:
    # timed out
    print('Timeout')
    # put the current url back into the task queue and retry later
except ConnectionError:
    # connection failed
    print('Connection error')
except RequestException:
    # any other request error
    print('Error')
else:
    if req.status_code == 200:
        print('Access OK!')
        # save the fetched page req.text locally
        fb = open("t.html", "wb")
        fb.write(req.content)
        fb.close()
    if req.status_code == 404:
        print('Page not found!')
        # remove the current url from the crawl tasks
    if req.status_code == 403:
        print('Access forbidden!')
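The comment in the Timeout branch suggests re-queuing the url and retrying later; a minimal sketch of that idea (the retry count and delay are my own assumptions):
import time
import requests
from requests.exceptions import RequestException

def fetch_with_retry(url, retries=3, delay=5):
    # try the request up to `retries` times, sleeping `delay` seconds between tries
    for attempt in range(retries):
        try:
            return requests.get(url, timeout=5)
        except RequestException:
            print('attempt %d failed: %s' % (attempt + 1, url))
            time.sleep(delay)
    return None  # give up; the caller can re-queue the url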
2. Hyperlink (URL) extraction and filtering
◆ URL extraction method (experiment)
import re
s = '''<li><a href="http://news.sina.com.cn/o/2018-11-06/a75.shtml"
target="_blank">进博会</a></li>
<li><a href="http://news.sina.com.cn/o/2018-11-06/a76.shtml"
target="_blank">大数据</a></li>
<li><a href="/o/2018-11-06/a75.shtml" target="_blank">进博会</a></li>'''
urls = re.findall('<a href="[a-zA-Z0-9/\.\-:]+', s)
# urls = re.findall('<a href="http://[a-zA-Z0-9/\.\-:]+"', s)
print(urls)
for url in urls:
    print(url[9:])  # strip the leading '<a href="'; with the commented-out
    # pattern above, which also matches the closing quote, use url[9:-1]
Reflection: a pattern that starts with http:// can only match absolute hyperlinks; removing the http:// part makes the pattern match hyperlinks with any beginning, including relative ones such as /o/2018-11-06/a75.shtml.
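Relative links must be converted to absolute URLs before they can be fetched; a sketch with urllib.parse.urljoin (the base URL is assumed from the sample HTML):
from urllib.parse import urljoin

base = 'http://news.sina.com.cn/'
print(urljoin(base, '/o/2018-11-06/a75.shtml'))
# -> http://news.sina.com.cn/o/2018-11-06/a75.shtml
print(urljoin(base, 'http://news.sina.com.cn/o/2018-11-06/a76.shtml'))
# absolute URLs pass through unchanged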
3. Crawl strategy search
Reflection:
As in the data structures and algorithms course, DFS can be implemented with a stack (here, via recursion) while BFS uses a queue. Since links farther from the seed node tend to be less closely related to it, we cap the depth, keeping the crawl within the first two or three levels where the links we want are most likely to be found.
import re
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
import urllib3
# requests verifies SSL certificates on https URLs; verification is switched off
# in get() below, which makes urllib3 print a warning
# suppress that warning
urllib3.disable_warnings()
visited = []      # urls that have already been visited (crawled)
unvisited = []    # urls extracted but not yet visited
url_count = 0     # number of urls visited so far
END_COUNT = 50    # stop after this many urls
end_flag = False  # termination flag

# Visit a page: collect the urls it contains
def visit(url, depth):
    visited.append(url)  # mark this link as visited
    try:
        req = requests.get(url, verify=False, timeout=5)
        # print(req.status_code)
        # verify=False: disable SSL verification
    except ReadTimeout:  # timed out
        print('Timeout: ', url)
        # put the current url back into the task queue and retry later
    except ConnectionError:  # connection failed
        print('Connection error: ', url)
    except RequestException:  # any other request error
        print('Error: ', url)
    else:
        if req.status_code == 404:
            print('404 page not found: ', url)
        if req.status_code == 403:
            print('403 access forbidden: ', url)
        if req.status_code == 200:
            # on success, bump the counter and check whether to stop
            global url_count
            global end_flag
            url_count += 1
            if url_count >= END_COUNT:
                end_flag = True
            print("\t" * depth, "#%d-%d %s" % (depth, url_count, url))
            PATTERN_URl = "<a.*href=\"(https?://.*?)[\"|\'].*"
            ulist = re.findall(PATTERN_URl, req.text)
            return ulist
    return None

def dfs(url, depth=1):
    ulist = visit(url, depth)
    if ulist:
        ulist = list(set(ulist) - set(visited))
        # ulist is local: the children of a single node
        for url in ulist:
            if depth < 3 and not end_flag:
                dfs(url, depth + 1)

def bfs(url):
    depth = 0
    global unvisited
    unvisited.append([url, depth])
    while unvisited:
        # each element of unvisited is [url, depth]
        [url, depth] = unvisited.pop(0)
        if end_flag or depth >= 3:
            break
        ulist = visit(url, depth)
        if ulist:
            ulist = list(set(ulist) - set(visited))
            depth += 1
            # the urls in ulist are children of the current url, hence depth + 1
            unvisited = unvisited + [[url, depth] for url in ulist]

if __name__ == '__main__':
    start_url = "http://www.hzau.edu.cn"
    strategy = input("Enter dfs/bfs: ")
    if strategy == "dfs":
        print("\t" * 0, "#%d %s" % (0, start_url))
        dfs(start_url)
    elif strategy == "bfs":
        bfs(start_url)
    else:
        print("Invalid input, please enter dfs or bfs")
4. Page content extraction
◆ Using html.parser (experiment)
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):  # subclass HTMLParser
    ctag = False
    # whether the tag being parsed is the one that holds the content

    def handle_starttag(self, tag, attrs):
        print('begin a tag:' + tag)
        if tag == 'h1':
            for attr in attrs:
                print(attr[0])
                if attr[1] == 'center':
                    self.ctag = True
                    break

    def handle_data(self, data):
        print('handle a tag')
        if self.ctag:
            print("Extracted data :", data)

    def handle_endtag(self, tag):
        print('end a tag:' + tag)
        self.ctag = False

parser = MyHTMLParser()
parser.feed('<html><head><title>Test</title></head>'
            '<body><h1 align="center">Big data news</h1><h1 align="center">'
            'AI news</h1><h1 align="right">2018.8.1</h1></body></html>')
◆ Using lxml (experiment)
from lxml import etree
html = '<html><head><title>Test</title></head><body><h1 ' \
       'align="center">Big data news</h1><h1 align="center">AI news</h1><h1 ' \
       'align="right">2018.8.1</h1></body></html>'
content = etree.fromstring(html)
rows = content.xpath('/html/body/h1')  # all nodes matching the path expression
for row in rows:  # process each node
    t = row.xpath('./text()')[0]
    print(t)

# Extracting a data table
html = '''<html><head><title>Test</title></head><body>
<table id="table1" cellspacing="0px">
<tr><th>学号</th><th>姓名</th><th>成绩</th></tr>
<tr><td>1001</td><td>曾平</td><td>90</td></tr>
<tr><td>1002</td><td>王一</td><td>92</td></tr>
<tr><td>1003</td><td>张三</td><td>88</td></tr>
</table></body></html>'''
content = etree.HTML(html)
rows = content.xpath('//table[@id="table1"]/tr')[1:]
for row in rows:
    id = row.xpath('./td[1]/text()')[0]
    name = row.xpath('./td[2]/text()')[0]
    score = row.xpath('./td[3]/text()')[0]
    print(id, name, score)

# Extract only the last record
content = etree.HTML(html)
rows = content.xpath('//table[@id="table1"]/tr[last()]')
for row in rows:
    id = row.xpath('./td[1]/text()')[0]
    name = row.xpath('./td[2]/text()')[0]
    score = row.xpath('./td[3]/text()')[0]
    print(id, name, score)
◆ Using BeautifulSoup (experiment)
# Tag usage example:
from bs4 import BeautifulSoup
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'xml')
# get the <b> tag
tag = soup.b
print(type(tag))
print(tag['class'])
print(tag.string)
# Sample code: information extraction with BeautifulSoup
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
html = '''
<html><body><div id="second-title">访华前 这个国家的总理说“感谢中国体谅”</div>
<div class="date-source"><span class="date">2019 年 03 月 27 日 21:30</span></div>
<span class="publish source">参考消息</span><div class="article">
<p>原标题:锐参考 | 访华前,这个国家的总理说:“感谢中国体谅!”</p><p>“非常感谢中国的理解!”</p>
<p>在 25 日的新闻发布会上,新西兰总理杰辛达·阿德恩这样说道。</p></div></body></html>
'''
soup = BeautifulSoup(html, 'lxml')
# an id is prefixed with #
title = soup.select('div#second-title')[0].text
# a class name is prefixed with a dot
date = soup.select('span.date')[0].text
# spaces inside a class attribute are replaced by dots: publish.source
source = soup.select('span.publish.source')[0].text
# child tags are selected with >
content = soup.select('div.article > p')
contentstr = ''
for i in range(len(content)):
    contentstr += content[i].text + "\n"
print("Title:", title)
print("Publication date:", date)
print("Source:", source)
print("Content:", contentstr)
II. Implementing a Topic Crawler
Case study: a crawler for a specific news topic
Relevance is computed with the given relevance measures:
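Two set-based measures appear in the code below, with T the topic word set and D the set of representative words extracted from the page: the Jaccard similarity sim = |T∩D| / |T∪D| = |T∩D| / (|T| + |D| − |T∩D|), and the cosine similarity of the binary term vectors, cos = |T∩D| / (√|T| · √|D|); the cosine form here follows the standard binary-vector definition.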
# Sample code: topic-based news collection with BeautifulSoup
import urllib.robotparser
import requests
from bs4 import BeautifulSoup
import jieba
from gensim.corpora.dictionary import Dictionary
import os
import re

# Save a file
def savefile(file_dir, content, seq):
    file_path = file_dir + os.sep + str(seq) + '.html'
    f = open(file_path, "wb")
    f.write(content.encode("utf-8"))  # encode to bytes
    f.close()

# Set the HTTP header fields
useragent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:57.0) Gecko/20100101 Firefox/57.0'
http_headers = {
    'User-Agent': useragent,
    'Accept': 'text/html'
}
# the topic is defined as a set of keywords
topicwords = {"网络", "安全", "法案", "预警", "设施", "互联网"}
website = 'http://roll.news.sina.com.cn/'
url = 'http://roll.news.sina.com.cn/news/gnxw/gdxw1/index.shtml'
file_dir = 'd:\\'  # directory for saved files
rp = urllib.robotparser.RobotFileParser()
rp.set_url(website + "robots.txt")
rp.read()
# make sure robots.txt permits access
if rp.can_fetch(useragent, url):
    page = requests.get(url, headers=http_headers)
    page.encoding = 'gb2312'
    content = page.text
    # load the stop-word list
    stoplist = open('stopword.txt', 'r', encoding="utf-8").readlines()
    stoplist = set(w.strip() for w in stoplist)
    # extract strings of the form href="http://news.sina.com.cn/o/2018-11-06/doc-ihmutuea7351575.shtml"
    ulist = re.findall('href="http://[a-z0-9/.\-]+\.shtml', content)
    seq = 1  # sequence number of the saved file (renamed from i to avoid
             # clashing with loop variables below)
    for u in ulist:
        u = u[6:]
        print(u)
        page = requests.get(u, headers=http_headers)
        page.encoding = 'utf-8'
        content = page.text
        bs = BeautifulSoup(content, 'lxml')
        ps = bs.select('div#article > p')
        doc = []
        for p in ps:
            p = p.text.strip("\n")
            if p != "":
                d = []
                # tokenize and filter
                for w in list(jieba.cut(p, cut_all=True)):
                    if len(w) > 1 and w not in stoplist:
                        d.append(w)
                doc.append(d)
        # print(doc)
        # Feature selection: keep words that occur at least 2 times and whose
        # fraction of documents is <= 1.0; take the first 10 such words as the
        # page's representative terms
        dictionary = Dictionary(doc)
        dictionary.filter_extremes(no_below=2, no_above=1.0, keep_n=10)
        d = dict(dictionary.items())
        docwords = set(d.values())
        # Relevance: Jaccard similarity of the topicwords and docwords sets
        commwords = topicwords.intersection(docwords)
        sim = len(commwords) / (len(topicwords) + len(docwords) - len(commwords))
        # Relevance: cosine similarity of the binary term vectors,
        # |T ∩ D| / (sqrt(|T|) * sqrt(|D|))
        num = 0
        for w in docwords:
            if w in topicwords:
                num = num + 1
        m = len(topicwords) ** 0.5
        n = len(docwords) ** 0.5
        if n == 0:
            cosx = 0.0
        else:
            cosx = num / (m * n)
        # if the similarity passes the threshold, the page is considered
        # on-topic and is saved to a file
        if cosx > 0.01:
            print(docwords)
            print("cos=", cosx)
            savefile(file_dir, content, seq)
            seq = seq + 1
else:
    print('Fetching not allowed!')
'''
Alternative: threshold on the Jaccard similarity instead of the cosine:
        if sim > 0.1:
            print(docwords)
            print("sim=", sim)
            savefile(file_dir, content, seq)
            seq = seq + 1
'''
III. Implementing a Dynamic-Page Crawler
(1) Build a URL with parameters and pass the dynamic request through them;
url = 'https://search.jd.com/Search'
# store the query keyword and attributes in a dict
qrydata = {
    'keyword': '互联网大数据',
    'enc': 'utf-8',
}
lt = []
for k, v in qrydata.items():
    lt.append(k + '=' + str(v))
query_string = '&'.join(lt)
url = url + '?' + query_string
print(url)
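requests can build and encode the query string by itself through the params argument; a sketch of the equivalent request:
import requests
r = requests.get('https://search.jd.com/Search',
                 params={'keyword': '互联网大数据', 'enc': 'utf-8'})
print(r.url)  # requests appends the URL-encoded query string automatically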
(2) Build a Cookie to carry the parameters, passing the dynamic request's parameters through the HTTP headers;
import requests
import re
# Copy the Cookie from the browser's developer tools and save it to taobao.txt
f = open(r'taobao.txt', 'r')  # open the saved cookie file
cookies = {}  # initialize the cookies dict
for line in f.read().split(';'):  # split the string on ';'
    name, value = line.strip().split('=', 1)
    cookies[name] = value  # add the entry to the cookies dict
r = requests.get("https://www.taobao.com/", cookies=cookies)
# print(r.text)
rs = re.findall(u'<title>.*</title>', r.text)
# expected: <title>淘宝网 - 淘!我喜欢</title>
print(rs)
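A requests.Session keeps cookies across requests automatically, so later requests in the same crawl reuse them without re-parsing the file; a minimal sketch using the cookies dict built above:
import requests
s = requests.Session()
s.cookies.update(cookies)  # seed the session with the parsed cookies
r = s.get("https://www.taobao.com/")  # cookies are sent and updated automatically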
(3) Ajax dynamic request technique
import requests
import json
url = 'https://hotels.ctrip.com/hotel/beijing1'
# the payload below was copied from the request observed in the browser
payloadHeader = {'content-type': 'application/json'}
payload = {"PlatformType":"pc","pageParameter":{"Refer":"","UA":"Mozilla%2F5.0%20(Windows%20NT%2010.0%3B%20Win64%3B%20x64)%20AppleWebKit%2F537.36%20(KHTML%2C%20like%20Gecko)%20Chrome%2F110.0.0.0%20Safari%2F537.36%20Edg%2F110.0.1587.63","PageID":102002,"VID":"1678325627776.3f3odn"},"marketParameter":{"AID":0,"SID":0},"terminalParameter":{"UserID":"","CityID":0},"pcAuthCodeParamet":{"IsGetAuthCode":"true","AppID":"","Length":4}}
# send the request with POST, passing the parameters via the data argument
res = requests.post(url, data=json.dumps(payload), headers=payloadHeader)
res.encoding = 'utf-8'
print(res.text)
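requests can also serialize the payload and set the Content-Type header itself via the json parameter; the call below should be equivalent to the data=json.dumps(...) form above:
res = requests.post(url, json=payload)  # serializes payload and sets content-type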
(4) Browser simulation.
Sample code: simulating header information
import requests
useragent = 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Mobile Safari/537.36'
http_headers = {
    'User-Agent': useragent,
    'Accept': 'text/html'
    # other header fields
}
page = requests.get(url, headers=http_headers)  # url is the address to request
Sample code: simulating a Baidu search with Selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
browser = webdriver.Edge(executable_path=r'msedgedriver.exe')
browser.get('https://www.baidu.com/')
time.sleep(2)  # wait for the page to load
input = browser.find_element_by_name("wd")
input.send_keys("信息检索")
input.send_keys(Keys.ENTER)
print(browser.current_url)
print(browser.page_source)
time.sleep(2)
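Note that executable_path and find_element_by_name belong to Selenium 3 and were removed in Selenium 4; if a recent Selenium is installed, the equivalent calls are roughly (a sketch assuming Selenium 4.6+, which locates the Edge driver by itself):
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

browser = webdriver.Edge()  # Selenium Manager finds msedgedriver automatically
browser.get('https://www.baidu.com/')
box = browser.find_element(By.NAME, "wd")  # replaces find_element_by_name
box.send_keys("信息检索", Keys.ENTER)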
IV. Implementing a Deep-Web Page Crawler
import requests
from bs4 import BeautifulSoup
import traceback
import os
import urllib.parse

# Read the list of publishers
def read_list(txt_path):
    press_list = []
    f = open(txt_path, 'r')
    for line in f.readlines():
        press_list.append(line.strip('\n'))
    return press_list

# Locate the input tag and build the query URL
def build_form(press_name):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'}
    res = requests.get('http://search.dangdang.com/advsearch', headers=header)
    res.encoding = 'GB2312'
    soup = BeautifulSoup(res.text, 'html.parser')
    # locate the input tag
    input_tag_name = ''
    conditions = soup.select('.box2 > .detail_condition > label')
    print('Found %d basic search conditions, looking for the input tag' % len(conditions))
    for item in conditions:
        text = item.select('span')[0].string
        if text == '出版社':
            input_tag_name = item.select('input')[0].get('name')
            print('Found the input tag, name:', input_tag_name)
    # build the url
    keyword = {'medium': '01',
               input_tag_name: press_name.encode('gb2312'),
               'category_path': '01.00.00.00.00.00',
               'sort_type': 'sort_pubdate_desc'
               }
    url = 'http://search.dangdang.com/?'
    url += urllib.parse.urlencode(keyword)
    print('Entry URL: %s' % url)
    return url

# Scrape the information
def get_info(entry_url):
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko'}
    res = requests.get(entry_url, headers=header)
    res.encoding = 'GB2312'
    # parsing with lxml loses part of the content here
    soup = BeautifulSoup(res.text, 'html.parser')
    # number of result pages
    page_num = int(soup.select('.data > span')[1].text.strip('/'))
    print('%d pages to scrape; only 1 page is collected in this test' % page_num)
    page_num = 1  # only fetch 1 page in this test
    page_now = '&page_index='
    # title, price, publication date
    books_title = []
    books_price = []
    books_date = []
    for i in range(1, page_num + 1):
        now_url = entry_url + page_now + str(i)
        print('Fetching page %d, URL: %s' % (i, now_url))
        res = requests.get(now_url, headers=header)
        soup = BeautifulSoup(res.text, 'html.parser')
        # titles
        tmp_books_title = soup.select('ul.bigimg > li[ddt-pit] > a')
        for book in tmp_books_title:
            books_title.append(book.get('title'))
        # prices
        tmp_books_price = soup.select('ul.bigimg > li[ddt-pit] > p.price > span.search_now_price')
        for book in tmp_books_price:
            books_price.append(book.text)
        # publication dates
        tmp_books_date = soup.select('ul.bigimg > li[ddt-pit] > p.search_book_author > span')
        for book in tmp_books_date[1::3]:
            books_date.append(book.text[2:])
    books_dict = {'title': books_title, 'price': books_price, 'date': books_date}
    return books_dict

# Save the data
def save_info(file_dir, press_name, books_dict):
    res = ''
    try:
        for i in range(len(books_dict['title'])):
            res += (str(i + 1) + '. ' + 'Title: ' + books_dict['title'][i] + '\r\n' +
                    'Price: ' + books_dict['price'][i] + '\r\n' +
                    'Publication date: ' + books_dict['date'][i] + '\r\n' + '\r\n'
                    )
    except Exception as e:
        print('Error while saving')
        print(e)
        traceback.print_exc()
    finally:
        file_path = file_dir + os.sep + press_name + '.txt'
        f = open(file_path, "wb")
        f.write(res.encode("utf-8"))
        f.close()
    return

# Entry point
def start_spider(press_path, saved_file_dir):
    # read the publisher list
    press_list = read_list(press_path)
    for press_name in press_list:
        print('------ Start scraping %s ------' % press_name)
        press_page_url = build_form(press_name)
        books_dict = get_info(press_page_url)
        save_info(saved_file_dir, press_name, books_dict)
        print('------ Publisher %s done ------' % press_name)
    return

if __name__ == '__main__':
    # path to the file listing publisher names
    press_txt_path = r'press.txt'
    # directory where the scraped data is saved
    saved_file_dir = r'D:\files'  # the folder must already exist
    # start
    start_spider(press_txt_path, saved_file_dir)