python 福利吧_简单的福利吧爬虫

最新推荐文章于 2024-01-07 00:13:27 发布

weixin_39819974

最新推荐文章于 2024-01-07 00:13:27 发布

阅读量3.7k

点赞数

文章标签： python 福利吧

本文链接：https://blog.csdn.net/weixin_39819974/article/details/113966405

版权

import os
import random
import re
import time

import requests
from lxml import etree

start_url = 'https://fuliba2020.net/2020'

# Desktop browser User-Agent strings.
# Partly sourced from: https://blog.csdn.net/IT__LS/java/article/details/78880903
_PC_AGENTS = (
    "Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_8; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0);",
    "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.6; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Mozilla/5.0 (Windows NT 6.1; rv:2.0.1) Gecko/20100101 Firefox/4.0.1",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
    "Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    # The comma after the next entry was missing in the original, which
    # silently glued the last two strings into a single bogus User-Agent.
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:76.0) Gecko/20100101 Firefox/76.0",
)

# Mobile browser User-Agent strings (same source as the desktop list).
_PHONE_AGENTS = (
    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
    "UCWEB7.0.2.37/28/999",
    # Entry disabled in the original: "NOKIA5700/ UCWEB7.0.2.37/28/999"
    "Openwave/ UCWEB7.0.2.37/28/999",
    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
)

# Matches an optional http(s) scheme, a dotted host name and the first "/"
# after it, i.e. the site root of a URL.
# Pattern source: https://www.cnblogs.com/blacksonny/p/6055357.html
_SITE_ROOT_RE = re.compile(
    r"^((http://)|(https://))?([a-zA-Z0-9]([a-zA-Z0-9\-]{0,61}[a-zA-Z0-9])?\.)+[a-zA-Z]{2,6}(/)"
)


def get_headers(url, use='pc'):
    """Build browser-like request headers for *url*.

    Args:
        url: The URL about to be requested. Its site root (scheme, host and
            the first ``/``) is sent as the ``Referer``.
        use: ``'phone'`` picks a random mobile User-Agent; any other value
            picks a random desktop User-Agent.

    Returns:
        A new dict of HTTP header names to values.
    """
    root = _SITE_ROOT_RE.search(url)
    # The original called .group() unconditionally and raised AttributeError
    # for URLs without a "/" after the host; fall back to the URL itself.
    referer = root.group() if root else url

    agents = _PHONE_AGENTS if use == 'phone' else _PC_AGENTS

    return {
        'User-Agent': random.choice(agents),
        'Referer': referer,
        'DNT': "1",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Connection': 'keep-alive',
        'Accept-Language': 'zh-CN,zh;q=0.9,en-CN;q=0.8,en;q=0.7',
        # NOTE(review): 'br' is advertised here; confirm the HTTP client in
        # use can actually decode Brotli responses in this environment.
        'Accept-Encoding': 'gzip, deflate, br',
    }

# Probe candidate article pages and store the ones that exist in url.txt.
def creat_url(origin_url):
    """Probe candidate page URLs and append the live ones to ``./url.txt``.

    Candidates are ``origin_url + NNN + '.html/' + page`` for every
    zero-padded ``NNN`` from 001 to 148 and ``page`` in 2 and 3. A candidate
    is recorded when its GET response was not redirected (its
    ``response.history`` is empty).

    Args:
        origin_url: URL prefix that the three-digit article number is
            appended to.
    """
    candidates = [
        origin_url + str(number).zfill(3) + '.html' + '/' + str(page)
        for number in range(1, 149)
        for page in range(2, 4)
    ]

    for candidate in candidates:
        try:
            # Without a timeout a stalled connection blocks the scan forever.
            response = requests.get(candidate, timeout=10)
        except requests.RequestException as exc:
            # One unreachable page must not abort the remaining probes.
            print(exc)
            continue

        if response.history:
            # The request was redirected, so this candidate is not recorded.
            continue

        print('正在写入' + candidate)
        # Re-open per hit so every recorded URL is on disk even if the scan
        # is interrupted part-way through.
        with open('./url.txt', 'a') as f:
            f.write(candidate + '\n')

# Read url.txt, strip the line endings, and return the URLs as a list.
def read_txt():
    """Return the URLs stored in ``./url.txt``, one list item per line.

    Only the trailing newline is removed from each line, so a final line
    without a newline keeps its last character (the original sliced
    ``line[:-1]`` and truncated it). Blank lines are skipped.

    Returns:
        A list of non-empty strings in file order.
    """
    with open('./url.txt', 'r') as f:
        stripped = (line.rstrip('\n') for line in f)
        return [line for line in stripped if line]

def get_imgurl(read_list):
    """Collect image URLs from article pages and append them to ``./image.txt``.

    Each page in *read_list* is fetched, the ``src`` of every ``<img>`` found
    under the article paragraphs is extracted, and every image URL not seen
    earlier in this call is printed and written on its own line.

    Args:
        read_list: Iterable of page URLs to scan.
    """
    xpath = '/html/body/section/div[1]/div/article/p/img/@src'
    seen = set()  # O(1) duplicate check across all pages of this call

    for url in read_list:
        try:
            # The original passed the header dict positionally, which
            # requests.get treats as query-string ``params``; it must be
            # given as ``headers=`` to be sent as HTTP headers.
            response = requests.get(url, headers=get_headers(url), timeout=10)
        except requests.RequestException as exc:
            # Skip an unreachable page instead of aborting the whole scan.
            print(exc)
            continue

        html = etree.HTML(response.text)
        if html is None:
            # NOTE(review): guards against the parser yielding no tree for an
            # empty body - confirm against the installed lxml version.
            continue

        fresh = []
        for img_url in html.xpath(xpath):
            if img_url not in seen:
                seen.add(img_url)
                fresh.append(img_url)
        if not fresh:
            continue

        with open('./image.txt', 'a') as f:
            for img_url in fresh:
                print('正在写入' + img_url)
                f.write(img_url + '\n')

def download_img():
    """Download every image listed in ``./image.txt`` into ``./images/``.

    Each non-blank line of ``./image.txt`` is treated as an image URL. A
    response with status 200 is saved under the last path segment of the URL
    and a progress line is printed. Any error for one image is printed and
    the loop moves on to the next one.
    """
    with open('./image.txt', 'r') as f:
        # Strip only the newline: the original sliced ``line[:-1]`` and cut
        # a real character off a final line that had no trailing newline.
        read_list = [line.rstrip('\n') for line in f]
    read_list = [img_url for img_url in read_list if img_url]

    # The target directory was never created, so every write failed on a
    # fresh checkout.
    os.makedirs('./images', exist_ok=True)

    total = len(read_list)
    calacute = 1  # 1-based count of successful downloads, shown as progress

    for img_url in read_list:
        try:
            response = requests.get(
                img_url, headers=get_headers(img_url), timeout=10)
            if response.status_code == 200:
                # Use the last path segment as the file name. The original
                # hard-coded ``img_url[30:]``, which only fits URLs with a
                # 30-character prefix and lets "/" or ".." leak into the path.
                name = img_url.split('?', 1)[0].rstrip('/').rsplit('/', 1)[-1]
                path = './images/' + name
                with open(path, 'wb') as f:
                    f.write(response.content)
                print('图片' + img_url + '下载成功' + '----'
                      + '当前进度[{}/{}]'.format(calacute, total))
                calacute += 1
        except Exception as e:
            # Deliberately broad best-effort: one bad image (network error,
            # unwritable name, malformed URL) must not stop the batch.
            print(e)
        # NOTE(review): the original's indentation was lost; the one-second
        # pause is placed after every attempt - confirm intended placement.
        time.sleep(1)

# Script entry point: running the file directly only performs the download
# step, which reads ./image.txt.
if __name__ == "__main__":
    download_img()

weixin_39819974

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python 福利吧_简单的福利吧爬虫

import requests import random import re from lxml import etree start_url = 'https://fuliba2020.net/2020' import time def get_headers(url, use='pc'): pc_agent = ["Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_...
复制链接

扫一扫