这里还是为了温习,照例做了放水处理:凡是涉及网址的地方都采用了 Base64 加密。实现分两种:一是 scrapy,二是 requests。首先是 requests 方法:
import requests
import re
import os
import base64
from lxml import etree
from urllib.parse import urljoin
def get_text(url):
    """Fetch *url* and return the response body decoded as UTF-8 text.

    Returns an empty string on any network failure so callers can always
    treat the result as text (the error itself is printed, best-effort).
    """
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
        # Bug fix: the HTTP header is spelled 'Referer' (the historical
        # misspelling), not 'Referrer', and the value stored in the source
        # is Base64-obfuscated — decode it before sending, otherwise the
        # server receives meaningless bytes.
        'Referer': base64.b64decode(b'aHR0cHM6Ly9waXhhYmF5LmNvbS96aC8=').decode('utf-8')}
    try:
        response = requests.get(url, headers=headers, timeout=30)
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException as err:
        print(err)
        return ''
def get_urls(url_template, length):
    """Expand *url_template* with page numbers 1..*length*, returning a list."""
    pages = range(1, length + 1)
    return list(map(url_template.format, pages))
def img_download(img):
    """Download image URL *img* into ./Pixabay/Wallpaper beneath the cwd.

    The file name is the last path segment of the URL.  Network and file
    errors are printed and swallowed (best-effort, as in the original).
    """
    # Bug fix: os.path.join(cwd, '\\Pixabay\\Wallpaper') discards the cwd
    # on Windows (a leading backslash is drive-root-relative) and is wrong
    # on POSIX.  Join the components portably, create the directory if it
    # does not exist, and write to the full path instead of mutating the
    # process-wide cwd with os.chdir().
    directory = os.path.join(os.getcwd(), 'Pixabay', 'Wallpaper')
    os.makedirs(directory, exist_ok=True)
    name = img.split('/')[-1]
    try:
        with open(os.path.join(directory, name), 'wb') as fh:
            # timeout added so a stalled download cannot hang the script.
            fh.write(requests.get(img, timeout=30).content)
    except (requests.RequestException, OSError) as err:
        # OSError covers the file-system failures the original tried to
        # catch (FileExistsError is one of its subclasses).
        print(err)
def clean_str(string):
    """Rewrite a thumbnail URL: bump 340/480 resolutions to 1280, then
    collapse the now-stale double underscore to a single one."""
    replacements = (('340', '1280'), ('480', '1280'), ('__', '_'))
    for old, new in replacements:
        string = string.replace(old, new)
    return string
# The post stores URLs and regexes Base64-obfuscated.  As raw bytes they
# cannot be used directly: bytes has no .format() and re.findall(bytes, str)
# raises TypeError — so decode each constant before use.
url_template = base64.b64decode(
    b'aHR0cHM6Ly9waXhhYmF5LmNvbS96aC9pbWFnZXMvc2VhcmNoLyVFNSVBMyU4MSVFNyVCQSVCOC8/cGFnaT17fQ=='
).decode('utf-8')
pattern1 = base64.b64decode(
    b'aHR0cHM6Ly9jZG5cLnBpeGFiYXlcLmNvbS9waG90by9cZCsvXGQrL1xkKy9cZCsvXGQrL1x3K1stXHcrXSstXGQrX19cZCtcLmpwZw=='
).decode('utf-8')
# pattern2 (single-underscore variant) is decoded for parity but, as in the
# original, never used below.
pattern2 = base64.b64decode(
    b'aHR0cHM6Ly9jZG5cLnBpeGFiYXlcLmNvbS9waG90by9cZCsvXGQrL1xkKy9cZCsvXGQrL1x3K1stXHcrXSstXGQrX1xkK1wuanBn'
).decode('utf-8')
length = 1  # number of search-result pages to crawl

urls = get_urls(url_template, length)
for url in urls:
    resp = get_text(url)
    imgs = re.findall(pattern1, resp)
    # Dedupe after the resolution rewrite, since distinct thumbnails can
    # collapse to the same 1280px URL.
    imgs = list(set(map(clean_str, imgs)))
    for img in imgs:
        img_download(img)
requests 版本到这里基本就完工了,之后再写 scrapy 版本。