简介:爬取 http://www.girl13.com 图片。
默认抓取间隔为 2 秒(即 self.time = 2),以防止访问过快被封 IP。
第一个版本
import os
import time
import requests
import threading
from bs4 import BeautifulSoup
class Girl13(object):
    """Crawler that downloads gallery images from http://www.girl13.com."""

    def __init__(self):
        self.session = requests.session()
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
            " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.time = 2  # seconds to sleep between pages, to avoid an IP ban

    def get_status(self, url):
        """GET *url* with the shared session; return the response on HTTP 200, else False."""
        response = self.session.get(url, headers=self.headers)
        if response.status_code == 200:
            return response
        print("ERROR: 网络连接失败!")
        return False

    def get_index(self, url):
        """Visit the home page once to establish the session; True on success."""
        if self.get_status(url):
            print("首页,建立连接...")
            return True
        print("ERROR: 首页访问失败!")
        return False

    def parse_html(self, url):
        """Parse one listing page.

        Returns a dict mapping image file name -> image URL, or None when
        the page could not be fetched.
        """
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        title_url = {}
        for column in html.select("#loop-square .column-post"):
            titles = column.select(".entry-title")
            title = titles[0].text if titles else None
            imgs = column.select(".entry-content.cf img")
            img_url = imgs[0].get("src") if imgs else None
            # Skip entries missing either piece; os.path.basename(None)
            # would raise TypeError and kill the crawl.
            if not title or not img_url:
                continue
            title_url[os.path.basename(img_url)] = img_url
        return title_url

    def get_last_page(self, url):
        """Return the last page number as an int, or None when unreachable."""
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        pages = html.select(".page-navigator li > a")
        if not pages:
            return None
        # The trailing "下一页" (next-page) link is not a page number; when it
        # is absent the last anchor itself carries the final page number.
        # (The original else-branch duplicated pages[-2].text — dead code.)
        if pages[-1].text == "下一页":
            last_page = pages[-2].text
        else:
            last_page = pages[-1].text
        return int(last_page)

    @staticmethod
    def next_page(last_page):
        """Yield the listing-page URL for every page from 1 to *last_page*."""
        for i in range(1, last_page + 1):
            yield "http://www.girl13.com/page/{}".format(i)

    def download(self, path, url):
        """Download *url* to *path*.

        Checks the response BEFORE opening the file: the original opened the
        file first, which left an empty file (and raised AttributeError on
        False.content) whenever the GET failed.
        """
        print(url)
        response = self.get_status(url)
        if not response:
            return
        with open(path, "wb") as f:
            f.write(response.content)

    def main_(self):
        # Establish the session against the home page.
        if not self.get_index("http://www.girl13.com"):
            return None
        last_page = self.get_last_page("http://www.girl13.com/page/1")
        if not last_page:
            return None
        image_dir = os.path.abspath(os.path.join(os.getcwd(), "image"))
        if not os.path.exists(image_dir):
            os.mkdir(image_dir)
        for url in self.next_page(last_page):
            title_url = self.parse_html(url)
            if not title_url:
                # parse_html may return None on a failed fetch; the original
                # iterated it unconditionally and crashed the worker thread.
                continue
            thread_list = []
            for title, img_url in title_url.items():
                path = os.path.join(image_dir, title)
                thread_list.append(
                    threading.Thread(target=self.download, args=(path, img_url)))
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()
            time.sleep(self.time)  # throttle between pages

    def main(self):
        """Run the crawl in a daemon thread and wait for it to finish."""
        t = threading.Thread(target=self.main_)
        t.daemon = True
        t.start()
        t.join()
if __name__ == '__main__':
    # Script entry point: build the crawler and start the crawl.
    crawler = Girl13()
    crawler.main()
第二个版本
修复第一个版本因网络异常等意外原因导致程序退出的问题。
# -*- coding: utf-8 -*-
import os
import time
import requests
import threading
from bs4 import BeautifulSoup
class Girl13(object):
    """Crawler that downloads gallery images from http://www.girl13.com.

    Second revision: network failures are caught so one bad request does
    not terminate the whole crawl.
    """

    def __init__(self):
        self.session = requests.session()
        self.headers = {
            "Connection": "keep-alive",
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64)"
            " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.10 Safari/537.36"
        }
        self.time = 2  # seconds to sleep between pages, to avoid an IP ban

    def get_status(self, url):
        """
        Fetch *url* with the shared session.

        :param url: address to request
        :return: the response on HTTP 200, otherwise False
        """
        try:
            response = self.session.get(url, headers=self.headers)
        except requests.exceptions.RequestException:
            # Was ConnectionError only; timeouts and other request errors
            # would still propagate and kill the calling thread.
            print("由于连接方在一段时间后没有正确答复或连接的主机没有反应,连接尝试失败")
            return False  # was an implicit None — now matches the documented contract
        if response.status_code == 200:
            return response
        print("ERROR: 网络连接失败!")
        return False

    def get_index(self, url):
        """
        Visit the home page once to establish the session.

        :param url: home-page address used to warm up the session
        :return: True when reachable, otherwise False
        """
        if self.get_status(url):
            print("首页,建立连接...")
            return True
        print("ERROR: 首页访问失败!")
        return False

    def parse_html(self, url):
        """
        Parse one listing page for image names and addresses.

        :param url: listing-page address
        :return: dict of {image file name: image url}, or None on fetch failure
        """
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        title_url = {}
        for column in html.select("#loop-square .column-post"):
            titles = column.select(".entry-title")
            title = titles[0].text if titles else None
            imgs = column.select(".entry-content.cf img")
            img_url = imgs[0].get("src") if imgs else None
            # Skip entries missing either piece; os.path.basename(None)
            # raises TypeError (previously caught after the fact).
            if not title or not img_url:
                continue
            title_url[os.path.basename(img_url)] = img_url
        return title_url

    def get_last_page(self, url):
        """
        Determine the number of the last listing page.

        :param url: address of the first page
        :return: last page number as int, or None when unreachable
        """
        response = self.get_status(url)
        if not response:
            return None
        html = BeautifulSoup(response.text, "html5lib")
        pages = html.select(".page-navigator li > a")
        if not pages:
            return None
        # The trailing "下一页" (next-page) link is not a page number; when it
        # is absent the last anchor itself carries the final page number.
        # (The original else-branch duplicated pages[-2].text — dead code.)
        if pages[-1].text == "下一页":
            last_page = pages[-2].text
        else:
            last_page = pages[-1].text
        return int(last_page)

    @staticmethod
    def next_page(last_page):
        """
        Generate listing-page URLs.

        :param last_page: number of the final page, inclusive
        :return: generator of page URLs from 1 to *last_page*
        """
        for i in range(1, last_page + 1):
            yield "http://www.girl13.com/page/{}".format(i)

    def download(self, path, url):
        """
        Download one image.

        :param path: destination file path
        :param url: image url
        :return: None

        The response is checked BEFORE the file is opened; the original
        opened the file first, leaving an empty file (and raising
        AttributeError on False.content) whenever the GET failed.
        """
        response = self.get_status(url)
        if not response:
            return
        with open(path, "wb") as f:
            f.write(response.content)

    def main_(self):
        # Establish the session against the home page.
        if not self.get_index("http://www.girl13.com"):
            return None
        last_page = self.get_last_page("http://www.girl13.com/page/1")
        if not last_page:
            return None
        image_dir = os.path.abspath(os.path.join(os.getcwd(), "image"))
        if not os.path.exists(image_dir):
            os.mkdir(image_dir)
        for url in self.next_page(last_page):
            print(url)
            title_url = self.parse_html(url)
            if not title_url:
                # parse_html may return None on a failed fetch; the original
                # iterated it unconditionally and crashed the worker thread.
                continue
            thread_list = []
            for title, img_url in title_url.items():
                path = os.path.join(image_dir, title)
                thread_list.append(
                    threading.Thread(target=self.download, args=(path, img_url)))
            for t in thread_list:
                t.start()
            for t in thread_list:
                t.join()
            time.sleep(self.time)  # throttle between pages

    def main(self):
        """Run the crawl in a daemon thread and wait for it to finish."""
        t = threading.Thread(target=self.main_)
        t.daemon = True
        t.start()
        t.join()
if __name__ == '__main__':
    # Script entry point: build the crawler and start the crawl.
    crawler = Girl13()
    crawler.main()