python3工具，批量多线程检测url的状态码、title、跳转后的页面

doulicau

已于 2022-05-12 12:55:30 修改

阅读量2.9k

点赞数 7

分类专栏： python 文章标签： python 爬虫 开发语言

于 2020-12-05 18:20:43 首次发布

本文链接：https://blog.csdn.net/doulicau/article/details/110704894

版权

python 专栏收录该内容

2 篇文章 2 订阅

订阅专栏

此脚本配合这篇文章中的脚本使用，可以批量测试url的状态码、title、跳转后的页面，结果保存到以运行时间命名的xls文件中（例如 2020-12-05 18.20.43.xls）。

python3 ip:port格式转化成http、https格式文件

python3 ip:port格式转化成http、https格式文件_doulicau的博客-CSDN博客

使用方法：

测试的url放到url.txt中，然后：

python3 gettitlecode.py

代码如下：

# -*- coding:utf-8 -*-
import re
import requests
import urllib3
import logging
from concurrent.futures import ThreadPoolExecutor
import time
import threading
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import xlwt
import xlrd
from xlutils.copy import copy
# Route messages issued through the `warnings` module into the logging system.
logging.captureWarnings(True)
# Silence InsecureRequestWarning, which would otherwise be raised for the
# verify=False requests this script makes.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Wall-clock start; the end of the script uses it to report the total run time.
start = time.time()
# Guards the result workbook, which several worker threads update.
lock = threading.Lock()
# The result file is named after the launch time, e.g. "2020-12-05 18.20.43".
savefilename = time.strftime("%Y-%m-%d %H.%M.%S")

# Create the result workbook: a single sheet that holds only the header row.
myxls = xlwt.Workbook()
sheet1 = myxls.add_sheet(u'title', cell_overwrite_ok=True)
for column, caption in enumerate(("源地址", "跳转地址", "状态码", "标题")):
    sheet1.write(0, column, caption)
myxls.save(savefilename + '.xls')

# Normalise url.txt into url-run.txt: a bare "ip:port" entry is expanded into
# both an http:// and an https:// candidate, while an entry that already
# starts with a scheme is copied through unchanged.
with open("url.txt", "r") as f:
    raw_lines = f.readlines()

# Written as UTF-8 because the script reads url-run.txt back as UTF-8.
with open("url-run.txt", "w", encoding="utf-8") as f2:
    for raw in raw_lines:
        # Drop the newline plus any stray "\r" or surrounding spaces.
        entry = raw.strip()
        if not entry:
            # A blank line would otherwise become the bogus targets
            # "http://" and "https://".
            continue
        # Test the prefix rather than a substring, so an entry such as
        # "host/path?next=http://x" is still treated as scheme-less.
        if entry.lower().startswith(("http://", "https://")):
            f2.write(entry + "\n")
        else:
            f2.write("http://" + entry + "\n")
            f2.write("https://" + entry + "\n")


# Request headers sent with every probe when fetching the status code and
# title; the User-Agent is that of a desktop Chrome browser.
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/78.0.3904.108 Safari/537.36"
)
header = {"User-Agent": _USER_AGENT}

def get_codetitle(url):
    """Fetch *url* and report its final URL, status code and page title.

    Redirects are followed and TLS certificates are not verified. The call
    never raises: a failed request yields the placeholder values instead.

    Args:
        url: Absolute http:// or https:// URL to request.

    Returns:
        A ``(resurl, code, title)`` tuple holding the URL reached after
        redirects, the HTTP status code, and the stripped text of the page's
        ``<title>`` element. When the request fails the tuple is
        ``(" ", "无法访问", " ")``; when the page has no non-empty title,
        ``title`` stays ``" "`` while ``resurl`` and ``code`` are still filled.
    """
    code = "无法访问"
    title = " "
    resurl = " "
    try:
        urllib3.disable_warnings()
        res = requests.get(url, headers=header, verify=False, allow_redirects=True, timeout=(3, 9))
    except Exception as error:
        # Best-effort scan: an unreachable target is an expected outcome, so
        # keep the placeholders and leave only a debug-level trace.
        logging.getLogger(__name__).debug("request to %s failed: %s", url, error)
        return resurl, code, title

    # Record these before looking at the body, so that a page without a
    # <title> still reports its status code and final URL.
    code = res.status_code
    resurl = res.url

    # Decode with the encoding detected from the body rather than the header.
    res.encoding = res.apparent_encoding
    # Accept attributes on the tag (e.g. <title data-x="1">) and take the text
    # up to the next "<"; [^<] also spans newlines inside the title.
    found = re.search(r"<title(?:\s[^>]*)?>([^<]+)", res.text, re.IGNORECASE)
    if found:
        text = found.group(1).strip()
        if text:
            title = text
    return resurl, code, title

def write(url):
    """Probe *url* and append the outcome as one row of the result workbook.

    The row holds the source URL, the final URL, the status code and the
    title, in that order. The same four values are also printed, separated
    by "|".

    Args:
        url: The URL to probe; passed unchanged to ``get_codetitle``.
    """
    resurl, code, title = (str(field) for field in get_codetitle(url))
    print(url + "|" + resurl + "|" + code + "|" + title)

    # xlwt rejects cell strings longer than 32767 characters; truncate so an
    # oversized title cannot make this row fail inside the worker thread.
    max_cell_chars = 32767
    row_values = [text[:max_cell_chars] for text in (url, resurl, code, title)]

    with lock:
        # The workbook on disk is the source of truth: re-read it, copy it
        # into a writable workbook, add the row after the last used one and
        # save it back. The lock keeps concurrent workers from interleaving
        # this read-modify-write cycle.
        workbook_path = savefilename + '.xls'
        existing = xlrd.open_workbook(workbook_path)
        next_row = existing.sheet_by_index(0).nrows
        writable = copy(existing)
        sheet = writable.get_sheet(0)
        for column, text in enumerate(row_values):
            sheet.write(next_row, column, text)
        writable.save(workbook_path)
    

# Load the list of targets to probe.
with open('url-run.txt', 'r', encoding='utf-8') as f:
    urls_data = [data.strip().strip('\\') for data in f]
# A blank line can only produce a guaranteed-failure row, so drop it.
urls_data = [target for target in urls_data if target]

# Probe the targets concurrently with a pool of worker threads.
with ThreadPoolExecutor(max_workers=100) as executor:
    pending = [(target, executor.submit(write, url=target)) for target in urls_data]

# Leaving the with-block waits for every task. An exception raised inside a
# worker is stored on its future and would otherwise vanish silently, so
# report each target whose row could not be written.
for target, future in pending:
    error = future.exception()
    if error is not None:
        print("写入失败:", target, error)

end = time.time()
print("总耗时:", end - start, "秒")