Python crawler: scraping all images from the first 50 pages of posts in Douban's 爱旅行爱摄影 (lvxing) group through a proxy IP pool
import datetime
import os
import random
import re
import requests
from bs4 import BeautifulSoup
# Session cookies copied from a logged-in Douban browser session; replace them with your own.
cookies = {
    'bid': 'IugKbNlTOO4',
    '__gads': 'ID=17955457aba85bd2-22174e75a3c9000b:T=1624175488:RT=1624175488:S=ALNI_MbG0yWu0hC8t-ufy4RFKHpUNcdFHw',
    '__utmc': '30149280',
    '__utmz': '30149280.1624190018.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)',
    'dbcl2': '192423504:3H2QqpYswqY',
    'ck': 'xOL4',
    'push_noty_num': '0',
    'push_doumail_num': '0',
    '__utmv': '30149280.19242',
    'douban-fav-remind': '1',
    'ap_v': '0,6.0',
    '_pk_ses.100001.8cb4': '*',
    '__utma': '30149280.536624023.1624190018.1624194706.1624201027.3',
    '__utmt': '1',
    '_pk_id.100001.8cb4': 'a5fdf088fcd7b731.1624190015.3.1624201032.1624195863.',
    '__utmb': '30149280.21.0.1624201032806',
}
# Browser-like request headers so Douban treats the crawler as a normal client.
headers = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.108 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Referer': 'https://www.douban.com/group/lvxing/discussion?start=325',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}
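Every request below reuses the same User-Agent, which makes the crawler easy to fingerprint. Rotating the User-Agent per request is a cheap countermeasure; in the sketch below, the extra UA strings are illustrative examples and random_headers is an optional helper the rest of the script does not depend on:

# Optional sketch: rotate the User-Agent per request.
USER_AGENTS = [
    headers['User-Agent'],  # the Chrome UA defined above
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
]

def random_headers():
    # Copy the base headers and swap in a randomly chosen User-Agent.
    h = dict(headers)
    h['User-Agent'] = random.choice(USER_AGENTS)
    return h

Callers can then pass headers=random_headers() instead of headers=headers.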
"""
获取代理ip池
url 获取代理ip的连接 使用的是芝麻代理
"""
def get_ip_list(url):
web_data = requests.get(url)
soup = BeautifulSoup(web_data.text, 'lxml')
ips = soup.select("p")
split = ips[0].string.split("\r\n")
ip_list = []
for i in range(0, len(split)):
ip_list.append(split[i])
return ip_list
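With type=3 the Zhima API appears to return plain text, one ip:port per line; lxml wraps that text in a single <p> node, which is why select("p") finds anything at all. Assuming that plain-text format, the HTML parse can be skipped entirely. A minimal sketch (get_ip_list_plain is an alternative helper, not used elsewhere in the script):

def get_ip_list_plain(url):
    # Sketch: assumes the endpoint returns plain text, one "ip:port" per line.
    resp = requests.get(url, timeout=10)
    return [line.strip() for line in resp.text.splitlines() if line.strip()]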
"""
随机选择一个代理ip
"""
def get_random_ip(ip_list):
proxy_list = []
for ip in ip_list:
proxy_list.append('http://' + ip)
proxy_ip = random.choice(proxy_list)
proxies = {'http': proxy_ip}
return proxies
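Proxies from a rotating pool like this die quickly, so checking a proxy before committing a download to it saves failed requests. A minimal sketch, using https://httpbin.org/ip purely as a convenient echo endpoint (check_proxy is an optional helper, not part of the flow above):

def check_proxy(proxies, timeout=5):
    # Return True if the proxy can complete a simple HTTPS request in time.
    try:
        requests.get('https://httpbin.org/ip', proxies=proxies, timeout=timeout)
        return True
    except requests.RequestException:
        return False

get_random_ip could then retry until check_proxy passes, dropping dead entries from ip_list as it goes.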
"""
imgs 帖子内所有的图片标签
name 帖子名称
time 帖子时间
Proxy_list 代理ip集合
下载图片并保存
"""
def save_img(imgs,name,time,Proxy_list):
for link in imgs:
url = link.get('src')
proxies = get_random_ip(Proxy_list)
resp = requests.get(url,headers=headers,cookies=cookies,proxies=proxies)
content=resp.content
print(resp.status_code)
size=len(content)
if size > 10*1024:
path = "D:/pachong/"+time+"/"+name
isExists = os.path.exists(path)
if not isExists:
os.makedirs(path)
imgName = path +"/%d" % size + ".webp"
open(imgName,'wb').write(content)
print("图片下载完毕" + imgName)
"""
tds 保存有图片标签的列表项集合
获取帖子名称 时间和所有的图片标签
"""
def get_img(tds):
for td in tds:
url = td.select('a')[0].get('href')
requests.adapters.DEFAULT_RETRIES = 5
s = requests.session()
s.keep_alive = False
ProxyUrl = "http://webapi.http.zhimacangku.com/getip?num=40&type=3&pro=&city=0&yys=0&port=11&time=3&ts=0&ys=0&cs=0&lb=1&sb=0&pb=45&mr=1®ions=";
Proxy_list = get_ip_list(ProxyUrl)
proxies = get_random_ip(Proxy_list)
resp = requests.get(url, headers=headers, cookies=cookies, proxies=proxies)
print(resp.status_code)
soup = BeautifulSoup(resp.content,'lxml')
imgs = soup.select('.image-wrapper img')
name = soup.select('.article h1')[0].string.strip().replace("\n","").replace(".","_")
rstr = r"[\/\\\:\*\?\"\<\>\|]"
name = re.sub(rstr,"_",name)
time = soup.select('.create-time')[0].string
time = datetime.datetime.strptime(time, '%Y-%m-%d %H:%M:%S').strftime('%Y%m%d')
print("开始下载这个帖子图片:"+name)
save_img(imgs,name,time,Proxy_list)
print(name+"帖子图片下载完毕")
"""
page 页码
获取所有包含帖子连接的td
"""
def start_page(page):
requests.adapters.DEFAULT_RETRIES = 5
s = requests.session()
s.keep_alive = False
page=(page-1)*25
url = 'https://www.douban.com/group/lvxing/discussion?start={}'.format(page)
resp = requests.get(url, headers=headers,cookies=cookies, timeout=10)
print(resp.status_code)
soup = BeautifulSoup(resp.content,'lxml')
table = soup.find(name="table", attrs={"class" :"olt"})
tds = table.select('.title')
get_img(tds)
if __name__ == '__main__':
    # Pages are 1-based: page 1 maps to start=0, page 50 to start=1225.
    for page in range(1, 51):
        print("Crawling page %d -----------------------" % page)
        start_page(page)
        print("Finished page %d -----------------------" % page)