Final part: the script
The task: crawl every PDF linked under the page, then search all the words in those PDFs for the one that satisfies the given condition.
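Concretely, the condition (as implemented in get_word below) is that the word, with the salt "Salz!" appended, produces a specific SHA-1 digest. A minimal illustration of that check, using "example" as a placeholder word:

import hashlib

# Append the salt "Salz!" to a candidate word and compare the
# SHA-1 hex digest against the target value.
candidate = "example" + "Salz!"   # "example" is just a placeholder
print(hashlib.sha1(candidate.encode("utf-8")).hexdigest())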
A point worth noting (for me, anyway): after requesting a page, the links in the response body come back relative, in the form
href="2/index.html" and href="eccbc87e4b5ce2fe28308fd9f2a7baf3.pdf", so each one has to be joined with the current page's path before it can be fetched.
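My script below does this joining by hand with regexes, but the standard library's urllib.parse.urljoin performs the same resolution. A quick sketch (the paths here just mirror the formats above):

from urllib.parse import urljoin

# A page somewhere in the crawl (illustrative path).
page_url = "http://61.147.171.105:54039/2/index.html"

# Relative hrefs as they appear in the response body.
print(urljoin(page_url, "3/index.html"))
# -> http://61.147.171.105:54039/2/3/index.html
print(urljoin(page_url, "eccbc87e4b5ce2fe28308fd9f2a7baf3.pdf"))
# -> http://61.147.171.105:54039/2/eccbc87e4b5ce2fe28308fd9f2a7baf3.pdf

The full script: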
import re
import requests
import hashlib
import sys
import os
import PyPDF2

url0 = "http://61.147.171.105:54039"

def get_word(txt):
    # Salt every word in the extracted text with "Salz!", hash it with
    # SHA-1, and stop as soon as the digest matches the target.
    word_match = r'(\w+)'
    word_matchs = re.finditer(word_match, txt)
    for word in word_matchs:
        word = word.group(1) + "Salz!"
        hashed_word = hashlib.sha1(word.encode('utf-8')).hexdigest()
        if hashed_word == "3fab54a50e770d830c0416df817567662a9dc85c":
            print(word)
            sys.exit()

def get_pdf(txt, url):
    # Collect the relative PDF links on this page.
    pdf_match = r'href="([0-9a-fA-F/]+\.pdf)"'
    pdf_matches = re.finditer(pdf_match, txt)
    # Recover the directory part of the current page's URL so the
    # relative PDF links can be made absolute.
    url_match = r'(http://61.147.171.105:54039/[0-9/]+)index\.html'
    url_matchs = re.finditer(url_match, url)
    for url in url_matchs:
        for pdf in pdf_matches:
            pdf = url.group(1) + pdf.group(1)
            response = requests.get(pdf)
            filename = os.path.basename(pdf)
            with open(filename, 'wb') as f:
                f.write(response.content)
            # Extract the text of every page and test its words.
            with open(filename, 'rb') as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                for page_num in range(len(pdf_reader.pages)):
                    page = pdf_reader.pages[page_num]
                    text = page.extract_text()
                    get_word(text)

def get_url(url):
    # Walk the directory pages recursively: scan the current page for
    # PDFs, then follow every sub-page link.
    url_match = r'href="([0-9/]+)index\.html"'
    request = requests.get(url)
    new_url_matches = re.finditer(url_match, request.text)
    parent_url_match = r'http://61.147.171.105:54039/([0-9/]+)index\.html'
    get_pdf(request.text, url)
    if url == "http://61.147.171.105:54039":
        # On the start page the hrefs are relative to the site root.
        for new_url in new_url_matches:
            get_url("http://61.147.171.105:54039/" + new_url.group(1) + "index.html")
    else:
        # On deeper pages the hrefs are relative to the current
        # directory, so prepend the parent path taken from the URL.
        parent_url_matches = re.finditer(parent_url_match, url)
        for parent_url in parent_url_matches:
            for new_url in new_url_matches:
                new_url = parent_url.group(1) + new_url.group(1)
                get_url("http://61.147.171.105:54039/" + new_url + "index.html")

get_url(url0)
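A possible simplification, as a sketch only (not tested against the site; it assumes the start page is also reachable at /index.html and that every non-PDF href points to another index page): a visited set plus urljoin removes both the special-casing of the root URL and the manual path regexes.

import io
import re
import hashlib
import requests
import PyPDF2
from urllib.parse import urljoin

TARGET = "3fab54a50e770d830c0416df817567662a9dc85c"

def crawl(start):
    queue, seen = [start], {start}
    while queue:
        page = queue.pop()
        html = requests.get(page).text
        for href in re.findall(r'href="([^"]+)"', html):
            link = urljoin(page, href)   # resolves both link formats above
            if link.endswith(".pdf"):
                # Read the PDF from memory instead of saving it to disk.
                reader = PyPDF2.PdfReader(io.BytesIO(requests.get(link).content))
                for pg in reader.pages:
                    for word in re.findall(r'\w+', pg.extract_text()):
                        salted = word + "Salz!"
                        if hashlib.sha1(salted.encode('utf-8')).hexdigest() == TARGET:
                            return salted
            elif link not in seen:
                seen.add(link)
                queue.append(link)

print(crawl("http://61.147.171.105:54039/index.html"))

Reading the PDF through io.BytesIO also avoids leaving downloaded files behind on disk.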
This was my first attempt at crawling files, so the code is basic and a bit long-winded. Go easy on me, and corrections are welcome.