Python爬虫实例

Captain_RB

已于 2022-03-23 10:07:29 修改

阅读量965

点赞数

分类专栏： Python 文章标签：爬虫 python

于 2021-05-16 20:00:50 首次发布

本文链接：https://blog.csdn.net/Captain_RB/article/details/116764597

版权

Python 专栏收录该内容

6 篇文章 1 订阅

订阅专栏

本文介绍了Python爬虫的两个应用实例：一是爬取美图网站各分类前5页的图片；二是探讨了如何使用Fofa进行信息搜集，包括常规方式和利用API进行数据提取。对于Fofa，强调了在常规获取信息时需要注意登录凭证（cookie）的有效性，而使用API则提供了更高效的数据提取方式。

摘要由CSDN通过智能技术生成

文章目录

1.爬取网站图片

爬取一个美图网，按照网站主页导航栏分类，每个类别仅爬取前5页内容，代码如下：

import os, requests, re
from bs4 import BeautifulSoup

# 携带cookie作为登录凭证
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0"}

# 版权问题无法使用真实网址
url = "https://www.xxxx.net"


def get_imagename(image_url):

    return re.search(r"/([0-9-a-zA-Z]*\.jpg)$", image_url).group(1)


def save_images(pathname, page_url):

    if os.listdir().count(pathname) == 0:
        os.mkdir(pathname)

    # 访问页码链接
    res = requests.get(page_url)
    soup = BeautifulSoup(res.content.decode("gbk"), "html.parser")

    # 将图片逐一存储到新建目录
    for i in soup.find_all("img"):
        image_url = i["src"]
        image_name = get_imagename(image_url)
        with open(pathname + '\\' + image_name, "wb") as f:
            f.write(requests.get(image_url).content)
            f.close()


def get_category_images(tag):

    category_url = tag["href"]
    category = tag["title"]

    # 访问导航栏
    res = requests.get(category_url, headers=headers)
    soup = BeautifulSoup(res.content.decode("gbk"), "html.parser")

    # 获取图片页码链接列表，设定最多只获取5页
    page_num = 0
    pages = []
    for i in soup.find_all("a", target="_self"):
        prefix = re.match(r"^https://www.xxxx.net/r/\d/", category_url).group()
        pages.append(prefix + i["href"])
        page_num += 1
        if page_num == 5:
            break

    # 创建导航栏类别命名的文件夹，将每页图片存储
    for i in pages:
        save_images(category, i)


def get_images():

    # 访问主页
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content.decode("gbk"), "html.parser")

    # 获取导航栏分类标签列表，不包括“主页”
    category = soup.find_all("a", class_="MainNav", title=re.compile(".*"))
    # 获取并存储导航栏类别链接中的图片
    for i in category:
        get_category_images(i)


if __name__ == "__main__":
    get_images()

运行结果：
在这里插入图片描述

2.Fofa信息搜集

对于提供API的网站，可以参考其API文档方便快速获取信息，但对于没有提供API的网站，或者发现通过API提取的信息并不完整，或者需要付费，这时仍需要通过分析请求数据的流程，利用常规爬虫搜集信息。这里以Fofa为例，介绍两种爬取信息的方式：

(1) 常规获取

首先在浏览器利用开发工具跟踪网页请求和回复，发现登录界面需要验证码，为方便起见登录后记录cookie值作为登录凭证使用，注意cookie有效时间，使用爬虫时最好重新登录使用最新的cookie，代码如下：

import base64, requests, time, random
from bs4 import BeautifulSoup

# 携带cookie作为登录凭证
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0",
           "Cookie": "befor_router=%2F; " \
                     "fofa_token=xxxxxx" \
                     "refresh_token=xxxxxx"
          }

# Fofa查询关键字
query = "domain=\"baidu.com\""

# 根据请求规则，qbase64为查询关键字的base64编码
# base64.b64encode编码后返回字节，需要编码转换为字符串
query_encode = query.encode("utf-8")
url = "https://fofa.so/result?qbase64=" + str(base64.b64encode(query_encode), "utf8")

# 定义列数不匹配异常
# 爬虫只能针对站点信息进行扫描，如果查询资产的IP或域名链接不能访问则会报错
MatchError = Exception("url_list does not match the number of columns ip_list, replace the query keyword... ")

# 随机延时
def time_sleep():
    random_time = random.uniform(3.0, 10.0)
    time.sleep(random_time)

# 获取当前页面(url,ip)列表
def fetch_url(soup):
    # (url,ip)列表
    target_list = []

    # 获取包含url的标签对象列表、
    # url列表最后一个为无关对象:粤ICP备16088626号，需要将其移除
    url_list = soup.find_all("a", target="_blank")
    url_list.remove(soup.find_all("a", target="_blank").pop())

    # 获取包含ip的标签对象列表
    div_list = soup.find_all("div", class_ = "contentLeft")
    ip_list = []
    for i in div_list:
        ip_list.append(i.find("a"))

    # 验证url与ip的标签对象列表一一对应
    if len(url_list) == len(ip_list):
        # 遍历获取(url,ip)列表
        for i in range(len(url_list)):
            target = (url_list[i].contents[0], ip_list[i].string)
            target_list.append(target)
        return target_list

    else:
        raise MatchError


def search(query):
    # (url,ip)列表
    target_list = []
    
    # 发送查询请求
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.content.decode(), "html.parser")

    # 查询条目总数
    # 条目数可能是1,533形式，需要将','去除
    item_total_num_str = soup.find("span", class_ = "pSpanColor").string
    item_total_num =int(item_total_num_str.replace(',', ''))
    pages_list = soup.find_all("li", class_=["number active", "number"])

    # 查询页码总数
    pages_total_num = int(pages_list.pop().string)

    if item_total_num == 0:
        print("Reasults:0\n")
        return
    
    # 因为Fofa普通注册用户只能查询50条数据，所以需要判断条目总数
    elif item_total_num <= 50:
        item_query_num = item_total_num
        for i in range(pages_total_num):
            res = requests.get(url + "&page=%d&page_size=10"%(i+1), headers=headers)
            soup = BeautifulSoup(res.content.decode(), "html.parser")
            
            # 循环获取每页的查询结果
            for j in fetch_url(soup):
                target_list.append(j)

            # 延时，避免频繁请求
            time_sleep()

    else:
        item_query_num = 50
        for i in range(5):
            res = requests.get(url + "&page=%d&page_size=10"%(i+1), headers=headers)
            soup = BeautifulSoup(res.content.decode(), "html.parser")
            
            # 循环获取每页的查询结果
            for j in fetch_url(soup):
                target_list.append(j)

            # 延时，避免频繁请求
            time_sleep()

    print("Total item num:%d" % item_total_num)
    print("Query item num:%d" % item_query_num)
    for i in target_list:
        print(i[0], i[1])


if __name__ == "__main__":
    search(query)

运行结果：
在这里插入图片描述

(2) API 获取

使用Fofa API需要开通会员或充值，使用方法参考 Fofa API文档，思路与常规爬虫相同，而且因为返回的是JSON数据，提取数据更加简单高效，代码如下：

import base64, requests, json

headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:88.0) Gecko/20100101 Firefox/88.0"}

# Fofa查询关键字
query = "domain=\"baidu.com\""

# Fofa API
api = "https://fofa.so/api/v1/search/all"

# 根据请求规则，qbase64为查询关键字的base64编码
# base64.b64encode编码后返回字节，需要编码转换为字符串
query_encode = query.encode("utf-8")
query_qbase64 = str(base64.b64encode(query_encode), "utf8")

params = {
		  # 注册和登陆时填写的email
          "email": "xxxxxx@xxxxxx",
          # 会员查看个人资料页面可得到key，为32位hash值
          "key": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
          # 经过base64加密的查询字符串
          "qbase64": query_qbase64,
          # 翻页数，默认为第一页
          "page": 1,
          # 每次查询返回记录数，默认100条
          "size": 20,
          # 可选查询字段，默认为host，可选参数包括：
          # title, ip, domain, port, country, province, city, country_name, header, server, protocol 
          # banner, cert, isp, as_number, as_organization, latitude, longitude, structinfo, icp
          "fields":"host,ip,country"
         }


def search(query):

    # 发送查询请求
    res = requests.get(api, headers=headers, params=params)

    # 返回JSON数据格式：
    # {
    #   "mode": "extended",
    #   "error": false,
    #   "query": "domain=\"nosec.org\"\n",
    #   "page": 1,
    #   "size": 6,
    #   "results": [
    #     ["https://i.nosec.org"],["https://nosec.org"],
    #   ]
    # }
    query_results = json.loads(res.content.decode())
    
    for i in query_results["results"]:
        print(i)


if __name__ == "__main__":
    search(query)