Final part: the script
The task: crawl every PDF linked under the page, then search all the words in those PDFs for the one that satisfies the given condition.
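Concretely, the condition (as implemented in get_word below) is that the word, with the salt "Salz!" appended, produces a specific SHA-1 digest. A minimal illustration of that check, using "example" as a placeholder word:

import hashlib

# Append the salt "Salz!" to a candidate word and compare the
# SHA-1 hex digest against the target value.
candidate = "example" + "Salz!"   # "example" is just a placeholder
print(hashlib.sha1(candidate.encode("utf-8")).hexdigest())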
A point worth noting (for me, anyway): after requesting a page, the links in the response body come back relative, in the form
href="2/index.html" and href="eccbc87e4b5ce2fe28308fd9f2a7baf3.pdf", so each one has to be joined with the current page's path before it can be fetched.
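My script below does this joining by hand with regexes, but the standard library's urllib.parse.urljoin performs the same resolution. A quick sketch (the paths here just mirror the formats above):

from urllib.parse import urljoin

# A page somewhere in the crawl (illustrative path).
page_url = "http://61.147.171.105:54039/2/index.html"

# Relative hrefs as they appear in the response body.
print(urljoin(page_url, "3/index.html"))
# -> http://61.147.171.105:54039/2/3/index.html
print(urljoin(page_url, "eccbc87e4b5ce2fe28308fd9f2a7baf3.pdf"))
# -> http://61.147.171.105:54039/2/eccbc87e4b5ce2fe28308fd9f2a7baf3.pdf

The full script: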
import re
import requests
import hashlib
import sys
import os
import PyPDF2

url0 = "http://61.147.171.105:54039"

def get_word(txt):
    # Salt every word in the extracted text with "Salz!", hash it with
    # SHA-1, and stop as soon as the digest matches the target.
    word_match = r'(\w+)'
    word_matchs = re.finditer(word_match, txt)
    for word in word_matchs:
        word = word.group(1) + "Salz!"
        hashed_word = hashlib.sha1(word.encode('utf-8')).hexdigest()
        if hashed_word == "3fab54a50e770d830c0416df817567662a9dc85c":
            print(word)
            sys.exit()

def get_pdf(txt, url):
    # Collect the relative PDF links on this page.
    pdf_match = r'href="([0-9a-fA-F/]+\.pdf)"'
    pdf_matches = re.finditer(pdf_match, txt)
    # Recover the directory part of the current page's URL so the
    # relative PDF links can be made absolute.
    url_match = r'(http://61.147.171.105:54039/[0-9/]+)index\.html'
    url_matchs = re.finditer(url_match, url)
    for url in url_matchs:
        for pdf in pdf_matches:
            pdf = url.group(1) + pdf.group(1)
            response = requests.get(pdf)
            filename = os.path.basename(pdf)
            with open(filename, 'wb') as f:
                f.write(response.content)
            # Extract the text of every page and test its words.
            with open(filename, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    text = page.extract_text()
                    get_word(text)

def get_url(url):
    # Walk the directory pages recursively: scan the current page for
    # PDFs, then follow every sub-page link.
    url_match = r'href="([0-9/]+)index\.html"'
    request = requests.get(url)
    new_url_matches = re.finditer(url_match, request.text)
    parent_url_match = r'http://61.147.171.105:54039/([0-9/]+)index\.html'
    get_pdf(request.text, url)
    if url == "http://61.147.171.105:54039":
        # On the start page the hrefs are relative to the site root.
        for new_url in new_url_matches:
            get_url("http://61.147.171.105:54039/" + new_url.group(1) + "index.html")
    else:
        # On deeper pages the hrefs are relative to the current
        # directory, so prepend the parent path taken from the URL.
        parent_url_matches = re.finditer(parent_url_match, url)
        for parent_url in parent_url_matches:
            for new_url in new_url_matches:
                new_url = parent_url.group(1) + new_url.group(1)
                get_url("http://61.147.171.105:54039/" + new_url + "index.html")

get_url(url0)
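A possible simplification, as a sketch only (not tested against the site; it assumes the start page is also reachable at /index.html and that every non-PDF href points to another index page): a visited set plus urljoin removes both the special-casing of the root URL and the manual path regexes.

import io
import re
import hashlib
import requests
import PyPDF2
from urllib.parse import urljoin

TARGET = "3fab54a50e770d830c0416df817567662a9dc85c"

def crawl(start):
    queue, seen = [start], {start}
    while queue:
        page = queue.pop()
        html = requests.get(page).text
        for href in re.findall(r'href="([^"]+)"', html):
            link = urljoin(page, href)   # resolves both link formats above
            if link.endswith(".pdf"):
                # Read the PDF from memory instead of saving it to disk.
                reader = PyPDF2.PdfReader(io.BytesIO(requests.get(link).content))
                for pg in reader.pages:
                    for word in re.findall(r'\w+', pg.extract_text()):
                        salted = word + "Salz!"
                        if hashlib.sha1(salted.encode('utf-8')).hexdigest() == TARGET:
                            return salted
            elif link not in seen:
                seen.add(link)
                queue.append(link)

print(crawl("http://61.147.171.105:54039/index.html"))

Reading the PDF through io.BytesIO also avoids leaving downloaded files behind on disk.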
This was my first attempt at crawling files, so the code is basic and a bit long-winded. Go easy on me, and corrections are welcome.