爬虫学习日记第八篇(爬取fofa某端口的协议排行及其机器数目,统计top200协议)

需求

找到最常用的200个协议
在这里插入图片描述
通过fofa搜索端口,得到协议排名前五名和对应机器的数目。
遍历各个端口,累加每个协议对应的机器数目(每个端口只能拿到排名前五的协议,所以结果不精确,但能看出个大概)。

读写API

API需要会员,一天只能访问1000次。

import base64
import urllib
from time import sleep
import requests
res = {}  # protocol name -> machine count, accumulated over every queried port


def onePort(j):
    """Query the FOFA stats API for port ``j`` and add its protocol counts to ``res``.

    The search expression ``port="<j>"`` is base64-encoded, URL-quoted and sent
    to the ``search/stats`` endpoint with ``fields=protocol``. Each entry of
    ``response["aggs"]["protocol"]`` is read for its ``name`` and ``count`` and
    the count is added to the module-level ``res`` dict.

    Args:
        j: Port number to query.

    Raises:
        requests.RequestException: On network failure or when the request
            exceeds the timeout.
    """
    # A bare ``import urllib`` does not guarantee that the ``parse`` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.parse

    text = 'port="' + str(j) + '"'
    text = base64.b64encode(text.encode("utf-8")).decode("utf-8")
    text = urllib.parse.quote(text)

    URL = f'https://fofa.info/api/v1/search/stats?fields=protocol&qbase64={text}&email=*****&key=*****'
    # Without a timeout a stalled connection would block the whole scan forever.
    r = requests.get(URL, timeout=30)
    response_dict = r.json()

    print("当前端口为:",j)
    print(response_dict)

    # A reply without "aggs" (presumably an error payload such as an exhausted
    # quota - verify against the API docs) contributes nothing instead of
    # aborting the scan with a KeyError.
    protocols = (response_dict.get('aggs') or {}).get('protocol') or []
    for entry in protocols:
        res[entry['name']] = res.get(entry['name'], 0) + entry['count']

    print(res)

# Query every port from 1 to 65535. range() excludes its stop value, so the
# stop must be 65536 for the last port to be included.
# NOTE: one request every 10 s is about 8640 requests per day.
for i in range(1, 65536):
    onePort(i)
    sleep(10)

爬虫

页面是动态加载的,受动态渲染影响,有的请求返回的结果为空。

单线程,未登录爬虫代码


import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
# Just add these four lines: they build the Chrome options with --headless set.
from selenium.webdriver.chrome.options import Options
options = Options()
options.add_argument('--headless')
# options.add_argument('--disable-gpu')

# Shared state updated by onePort() for every scanned port.
failed=[]   # ports for which no protocol entries were extracted
success=[]  # ports for which at least one protocol entry was extracted
res = {}    # protocol name -> machine count summed over all scanned ports
def onePort(j):
    """Scrape the FOFA result page for port ``j`` and tally its protocol counts.

    Opens ``https://fofa.info/result`` for the query ``port="<j>"`` in a fresh
    Chrome instance, waits a fixed 7 seconds for the page to render, then reads
    the link texts (protocol names) and span texts (counts) under the 13th
    ``div`` of the statistics container. Counts are added to the module-level
    ``res`` dict; ``j`` is appended to ``failed`` when nothing was extracted and
    to ``success`` otherwise.

    Args:
        j: Port number to look up.

    Raises:
        ValueError: If a paired count text is not an integer after removing
            thousands separators.
    """
    # A bare ``import urllib`` does not guarantee that the ``parse`` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.parse

    text = 'port="' + str(j) + '"'
    text = base64.b64encode(text.encode("utf-8")).decode("utf-8")
    text = urllib.parse.quote(text)
    print(text)

    s = Service(r".\chromedriver.exe")
    driver = webdriver.Chrome(service=s,options=options)
    try:
        driver.get("https://fofa.info/result?qbase64=" + text)
        # Fixed wait for the dynamically rendered statistics to appear.
        sleep(7)
        page_text = driver.page_source
    finally:
        # Always shut the browser down, even if loading the page fails;
        # otherwise every failure leaves a Chrome process behind.
        driver.quit()

    tree = etree.HTML(page_text)

    raw_protos = tree.xpath(
        '//div[@class="hsxa-ui-component hsxa-meta-data-statistical-list hsxa-pos-rel"]/div[13]//li//a/text()')
    raw_nums = tree.xpath(
        '//div[@class="hsxa-ui-component hsxa-meta-data-statistical-list hsxa-pos-rel"]/div[13]//li//span/text()')

    # zip() pairs names with counts and stops at the shorter list, so a
    # mismatch in the number of extracted nodes cannot raise IndexError.
    protos = []
    nums = []
    for raw_proto, raw_num in zip(raw_protos, raw_nums):
        protos.append(raw_proto.strip())
        # Counts are rendered with thousands separators, e.g. "1,234".
        nums.append(int(raw_num.strip().replace(',', '')))

    for proto, count in zip(protos, nums):
        res[proto] = res.get(proto, 0) + count

    print(protos)
    print(nums)
    if not protos:
        failed.append(j)
    else:
        success.append(j)
    print("当前端口号:", j)
    print("失败列表:", failed)
    print("成功列表:", success)
    print(res)

# Scrape ports 5000 through 9999 one after another.
FIRST_PORT, STOP_PORT = 5000, 10000
for port in range(FIRST_PORT, STOP_PORT):
    onePort(port)

多线程未登录代码

一定要注意多线程同时读写全局变量的问题:读写全局变量时要加锁。

import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
import threading

# Just add these four lines: they build the Chrome options with --headless set.
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument('--headless')
# options.add_argument('--disable-gpu')

# Shared state updated by onePort() from the worker threads.
failed = []   # ports for which no protocol entries were extracted
success = []  # ports for which at least one protocol entry was extracted
res = {}      # protocol name -> machine count summed over all scanned ports
lock = threading.Lock()  # create the lock that guards the shared state above

def onePort(j):
    """Scrape the FOFA result page for port ``j`` and tally its protocol counts.

    Intended to run on worker threads. Each call starts its own Chrome
    instance, loads the result page for the query ``port="<j>"``, waits a fixed
    7 seconds, and reads protocol names and counts from the 13th ``div`` of the
    statistics container. The shared ``res``, ``failed`` and ``success``
    objects are only touched while holding ``lock``.

    Args:
        j: Port number to look up.

    Raises:
        ValueError: If a paired count text is not an integer after removing
            thousands separators.
    """
    # A bare ``import urllib`` does not guarantee that the ``parse`` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.parse

    text = 'port="' + str(j) + '"'
    text = base64.b64encode(text.encode("utf-8")).decode("utf-8")
    text = urllib.parse.quote(text)
    print(text)

    s = Service(r".\chromedriver.exe")
    driver = webdriver.Chrome(service=s, options=options)
    try:
        driver.get("https://fofa.info/result?qbase64=" + text)
        # Fixed wait for the dynamically rendered statistics to appear.
        sleep(7)
        page_text = driver.page_source
    finally:
        # Always shut the browser down; with many workers a leaked Chrome
        # process per failure adds up quickly.
        driver.quit()

    tree = etree.HTML(page_text)

    raw_protos = tree.xpath(
        '//div[@class="hsxa-ui-component hsxa-meta-data-statistical-list hsxa-pos-rel"]/div[13]//li//a/text()')
    raw_nums = tree.xpath(
        '//div[@class="hsxa-ui-component hsxa-meta-data-statistical-list hsxa-pos-rel"]/div[13]//li//span/text()')

    # Parse outside the lock: these lists are local to this call. zip() stops
    # at the shorter list, so a node-count mismatch cannot raise IndexError.
    protos = []
    nums = []
    for raw_proto, raw_num in zip(raw_protos, raw_nums):
        protos.append(raw_proto.strip())
        # Counts are rendered with thousands separators, e.g. "1,234".
        nums.append(int(raw_num.strip().replace(',', '')))

    with lock:  # guard every read/write of the shared res/failed/success
        for proto, count in zip(protos, nums):
            res[proto] = res.get(proto, 0) + count

        print(protos)
        print(nums)
        if not protos:
            failed.append(j)
        else:
            success.append(j)
        print("当前端口号:", j)
        print("失败列表:", failed)
        print("成功列表:", success)
        print(res)


with ThreadPoolExecutor(30) as t:
    # Submit one scraping task per port to the thread pool and keep the
    # futures: an exception raised inside a worker is stored on its future and
    # would otherwise vanish without a trace.
    futures = {t.submit(onePort, j): j for j in range(10000,10500)}
    for future, port in futures.items():
        error = future.exception()  # blocks until that task has finished
        if error is not None:
            print("port", port, "raised", repr(error))

手动登录获取cookie代码

# Standalone helper script: open fofa.info, let the user log in by hand, then
# dump the browser's cookies to cookies.txt as JSON.
# The imports live here so the snippet runs on its own.
import json
import time

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Path to the chromedriver executable
s = Service(r".\chromedriver.exe")
driver= webdriver.Chrome(service=s)
try:
    # Remember to write the full URL, including the http/https scheme
    driver.get('https://fofa.info')
    # Log in manually within 20 seconds after the page opens
    time.sleep(20)
    with open('cookies.txt','w') as f:
        # Save the cookies in JSON format
        f.write(json.dumps(driver.get_cookies()))
finally:
    # quit() ends the whole WebDriver session; close() only closes the current
    # window. Running it in finally also cleans up when saving the cookies fails.
    driver.quit()

登录账号的单线程爬虫


from selenium import webdriver
import time
import json

from selenium.webdriver.chrome.service import Service

import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
from selenium.webdriver.chrome.options import Options

from selenium.webdriver.chrome.options import Options
# Logged-in, single-port scrape: restore the saved cookies, load the FOFA
# result page for one port and tally the protocol counts found on it.

# A bare ``import urllib`` does not guarantee that the ``parse`` submodule is
# loaded, so import it explicitly.
import urllib.parse

options = Options()
# options.add_argument('--headless')
# options.add_argument('--disable-gpu')
# NOTE(review): the user-agent value is wrapped in literal double quotes;
# confirm Chrome does not include them in the header it sends.
options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.46"')

failed=[]   # ports for which no protocol entries were extracted
success=[]  # ports for which at least one protocol entry was extracted
res = {}    # protocol name -> accumulated machine count

s = Service(r".\chromedriver.exe")
driver = webdriver.Chrome(service=s, options=options)
try:
    driver.get('https://fofa.info')
    # First clear any cookies the browser already holds after opening the site
    driver.delete_all_cookies()

    with open('cookies.txt', 'r') as f:
        # Reading from a file object, hence json.load rather than json.loads
        cookies_list = json.load(f)

    # Convert a float expiry to int before adding the cookie
    for cookie in cookies_list:
        # Not every cookie has an expiry, so read it with dict.get
        if isinstance(cookie.get('expiry'), float):
            cookie['expiry'] = int(cookie['expiry'])
        driver.add_cookie(cookie)

    # Request the page again (essential: otherwise the browser is still on the
    # login page after the cookies were added)
    driver.get('https://fofa.info')
    # Wait for the page to load completely; this step matters
    time.sleep(3)

    j=2
    text = 'port="' + str(j) + '"'
    text = base64.b64encode(text.encode("utf-8")).decode("utf-8")
    text = urllib.parse.quote(text)
    print(text)
    sleep(10)
    driver.get("https://fofa.info/result?qbase64=" + text)
    sleep(6)
    page_text = driver.page_source
finally:
    # Always shut the browser down, even when a step above raises.
    driver.quit()

print(page_text)

tree = etree.HTML(page_text)

raw_protos = tree.xpath(
    '//div[@class="hsxa-ui-component hsxa-meta-data-statistical-list hsxa-pos-rel"]/div[13]//li//a/text()')
raw_nums = tree.xpath(
    '//div[@class="hsxa-ui-component hsxa-meta-data-statistical-list hsxa-pos-rel"]/div[13]//li//span/text()')

# zip() pairs names with counts and stops at the shorter list, so a mismatch
# in the number of extracted nodes cannot raise IndexError.
protos = []
nums = []
for raw_proto, raw_num in zip(raw_protos, raw_nums):
    protos.append(raw_proto.strip())
    # Counts are rendered with thousands separators, e.g. "1,234".
    nums.append(int(raw_num.strip().replace(',', '')))

for proto, count in zip(protos, nums):
    res[proto] = res.get(proto, 0) + count

print(protos)
print(nums)
if not protos:
    failed.append(j)
else:
    success.append(j)
print("当前端口号:", j)
print("失败列表:", failed)
print("成功列表:", success)
print(res)

登录用户多线程

import threading

from selenium import webdriver
import time
import json

from selenium.webdriver.chrome.service import Service

import base64
import json
import urllib
from concurrent.futures import ThreadPoolExecutor
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from lxml import etree
from time import sleep
from selenium.webdriver.chrome.options import Options

from selenium.webdriver.chrome.options import Options
# Chrome options shared by every worker: headless mode plus a fixed user-agent.
options = Options()
options.add_argument('--headless')
# options.add_argument('--disable-gpu')
# NOTE(review): the user-agent value is wrapped in literal double quotes;
# confirm Chrome does not include them in the header it sends.
options.add_argument('user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36 Edg/118.0.2088.46"')

# Shared state updated by onePort() from the worker threads.
failed=[]   # ports for which no protocol entries were extracted
success=[]  # ports for which at least one protocol entry was extracted
res = {}    # protocol name -> machine count summed over all scanned ports
lock = threading.Lock()  # guards the shared state above

def onePort(j):
    """Scrape the FOFA result page for port ``j`` as a logged-in user.

    Each call starts its own Chrome instance, restores the cookies stored in
    ``cookies.txt``, loads the result page for the query ``port="<j>"``, waits
    a fixed 6 seconds, and reads protocol names and counts from the 13th
    ``div`` of the statistics container. The shared ``res``, ``failed`` and
    ``success`` objects are only touched while holding ``lock``.

    Args:
        j: Port number to look up.

    Raises:
        OSError: If ``cookies.txt`` cannot be opened.
        ValueError: If a paired count text is not an integer after removing
            thousands separators.
    """
    # A bare ``import urllib`` does not guarantee that the ``parse`` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.parse

    s = Service(r".\chromedriver.exe")
    driver = webdriver.Chrome(service=s, options=options)
    try:
        driver.get('https://fofa.info')
        # First clear any cookies the browser already holds after opening the site
        driver.delete_all_cookies()

        with open('cookies.txt', 'r') as f:
            # Reading from a file object, hence json.load rather than json.loads
            cookies_list = json.load(f)

        # Convert a float expiry to int before adding the cookie
        for cookie in cookies_list:
            # Not every cookie has an expiry, so read it with dict.get
            if isinstance(cookie.get('expiry'), float):
                cookie['expiry'] = int(cookie['expiry'])
            driver.add_cookie(cookie)

        # Request the page again (essential: otherwise the browser is still on
        # the login page after the cookies were added)
        driver.get('https://fofa.info')
        # Wait for the page to load completely; this step matters
        time.sleep(3)

        text = 'port="' + str(j) + '"'
        text = base64.b64encode(text.encode("utf-8")).decode("utf-8")
        text = urllib.parse.quote(text)
        print(text)
        driver.get("https://fofa.info/result?qbase64=" + text)
        sleep(6)

        page_text = driver.page_source
    finally:
        # Always shut the browser down, even when a step above raises.
        driver.quit()

    tree = etree.HTML(page_text)

    raw_protos = tree.xpath(
        '//div[@class="hsxa-ui-component hsxa-meta-data-statistical-list hsxa-pos-rel"]/div[13]//li//a/text()')
    raw_nums = tree.xpath(
        '//div[@class="hsxa-ui-component hsxa-meta-data-statistical-list hsxa-pos-rel"]/div[13]//li//span/text()')

    # Parse outside the lock: these lists are local to this call. zip() stops
    # at the shorter list, so a node-count mismatch cannot raise IndexError.
    protos = []
    nums = []
    for raw_proto, raw_num in zip(raw_protos, raw_nums):
        protos.append(raw_proto.strip())
        # Counts are rendered with thousands separators, e.g. "1,234".
        nums.append(int(raw_num.strip().replace(',', '')))

    with lock:  # guard every read/write of the shared res/failed/success
        for proto, count in zip(protos, nums):
            res[proto] = res.get(proto, 0) + count

        print(protos)
        print(nums)
        if not protos:
            failed.append(j)
        else:
            success.append(j)
        print("当前端口号:", j)
        print("失败列表:", failed)
        print("成功列表:", success)
        print(res)


with ThreadPoolExecutor(1) as t:
    # Submit one scraping task per port to the thread pool and keep the
    # futures: an exception raised inside a worker is stored on its future and
    # would otherwise vanish without a trace.
    futures = {t.submit(onePort, j): j for j in range(3679,4000)}
    for future, port in futures.items():
        error = future.exception()  # blocks until that task has finished
        if error is not None:
            print("port", port, "raised", repr(error))

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

开心星人

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值