1. Forgive me, I'm a bit of a philistine...
2. There are so many Python scrapers for meizi (girl) pictures online, so I'm joining the fun...
3. If anything here is wrong, corrections are welcome... thanks in advance...
4. My code may differ a bit from some of the code you'll find online, which is normal... And if the code stops working after a while, don't blame my code: the sites change.
5. In my Python code the image save path is an absolute path, on Ubuntu... As for the Python version, it's 2.7.12, no mistake...
6. OK, no more chatter, straight to the code:
7. Oh, one more thing: please read and understand the code yourself...
#!/usr/bin/env python
#coding=utf-8
import requests
import os
import re
from bs4 import BeautifulSoup
import hashlib
import base64
import time


class jiandanSpider(object):
    headers = {"user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"}

    def md5(self, src):
        # Unused helper; kept in case you prefer hashed filenames.
        m = hashlib.md5()
        m.update(src.encode('utf-8'))
        return m.hexdigest()

    def decode_base64(self, url):
        # The page stores each image URL base64-encoded in an .img-hash element.
        return base64.b64decode(url).decode('utf-8')

    def get_raw_html(self, url):
        html = requests.get(url, headers=self.headers)
        return html.content

    def get_soup(self, html):
        soup = BeautifulSoup(html, 'lxml')
        return soup

    def get_hashes(self, soup):
        # Decode every .img-hash element into a protocol-relative image URL.
        hashes = []
        for each in soup.find_all(class_='img-hash'):
            url = self.decode_base64(each.text)
            hashes.append(url)
        return hashes

    def get_timestr(self):
        # Timestamp string used as the image filename.
        time_now = int(time.time())
        time_local = time.localtime(time_now)
        dt = time.strftime("%Y-%m-%d_%H:%M:%S", time_local)
        return dt

    def download_images(self, urls, dirpath):
        if not os.path.exists(dirpath):
            os.makedirs(dirpath)
        for url in urls:
            url = "http:" + url
            response = requests.get(url, headers=self.headers)
            img = response.content
            suffix = url.split('.')[-1]
            print(suffix)  # the image's file extension
            filename = self.get_timestr()
            time.sleep(1)  # sleep 1s: keeps timestamp filenames unique and avoids getting banned
            with open(dirpath + '/' + filename + '.' + suffix, 'wb') as f:
                f.write(img)

    def get_maxpages(self, soup):
        # The current page carries the highest comment-page number.
        page = soup.find_all('span', class_='current-comment-page')
        pattern = re.compile(r'\d+')
        m = pattern.search(str(page))
        if not m:
            print('<--- not found --->')
            return 0
        max_page = int(m.group(0))
        print(max_page)
        return max_page

    def sublist_format(self, max_page):
        # Build the URL of every comment page, 1 through max_page.
        sub_list = []
        for i in range(1, max_page + 1):
            s = 'http://jandan.net/ooxx/page-{}#comments'.format(i)
            sub_list.append(s)
        return sub_list

    def spider(self, url, dirpath):
        html = self.get_raw_html(url)
        soup = self.get_soup(html)
        max_pages = self.get_maxpages(soup)  # the current page reports the max page count
        url_list = self.sublist_format(max_pages)  # generate each page's URL
        for page_url in url_list:  # then fetch each page, parse it, and save the images
            html = self.get_raw_html(page_url)
            soup = self.get_soup(html)
            urlspath = self.get_hashes(soup)
            self.download_images(urlspath, dirpath)


if __name__ == '__main__':
    # start crawling
    url = 'http://jandan.net/ooxx/'
    dirpath = '/home/menethis/work/Test/pythonTest/meizhi'  # this is where I save the images
    jiandan = jiandanSpider()
    jiandan.spider(url, dirpath)
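To make the .img-hash step concrete, here is a minimal round-trip sketch of what decode_base64() recovers. The URL below is made up for illustration, not a real Jandan hash, and note the site has changed its obfuscation scheme over time:

#coding=utf-8
import base64

# Hypothetical protocol-relative image URL (not taken from a real page).
fake_url = '//wx1.example.com/large/abc123.jpg'
img_hash = base64.b64encode(fake_url)  # what the page would embed in an .img-hash element
print(base64.b64decode(img_hash))      # what decode_base64() gets back; the spider then prepends "http:"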
Yep, that's right, girls from the Jandan site, very pretty ones...
And if that's not your thing (is it really not...),
no problem, I also have Python code here that scrapes doutu (meme/reaction images)...
Same as before: judge for yourself whether it still works...
Posting it right here; corrections welcome... la la la:
# -*- coding:utf-8 -*-
import requests
from bs4 import BeautifulSoup
import os


class doutuSpider(object):
    headers = {
        "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"}

    def get_url(self, url):
        # Each list page links to one album per .list-group-item anchor.
        data = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(data.content, 'lxml')
        totals = soup.findAll("a", {"class": "list-group-item"})
        for one in totals:
            sub_url = one.get('href')
            # Save images under a directory named after the album.
            self.path = '/home/menethis/work/Test/pythonTest' + '/image/' + sub_url.split('/')[-1]
            if not os.path.exists(self.path):
                os.makedirs(self.path)
            try:
                self.get_img_url(sub_url)
            except Exception:
                pass

    def get_img_url(self, url):
        # Each album page holds one image per .artile_des block ("artile" is the site's own class name).
        data = requests.get(url, headers=self.headers)
        soup = BeautifulSoup(data.content, 'lxml')
        totals = soup.find_all('div', {'class': 'artile_des'})
        for one in totals:
            img = one.find('img')
            if img is None:
                continue
            sub_url = img.get('src')
            if not sub_url:
                continue
            try:
                self.get_img(sub_url)  # prepend 'http:' here if the src comes back protocol-relative
            except Exception:
                pass

    def get_img(self, url):
        filename = url.split('/')[-1]
        img_path = self.path + '/' + filename
        print(url)
        img = requests.get(url, headers=self.headers)
        try:
            with open(img_path, 'wb') as f:
                f.write(img.content)
        except IOError:
            pass

    def create(self):
        for count in range(1, 31):
            url = 'https://www.doutula.com/article/list/?page={}'.format(count)
            print('Downloading page {}'.format(count))
            self.get_url(url)


if __name__ == '__main__':
    doutu = doutuSpider()
    doutu.create()
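Both spiders swallow network errors with bare except blocks. If you want something sturdier, here is a small sketch of my own (not part of the original scripts) that gives requests a timeout plus automatic retries with backoff, via urllib3's Retry; either spider could swap its requests.get calls for this session:

# -*- coding:utf-8 -*-
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry  # on older requests installs: requests.packages.urllib3.util.retry

session = requests.Session()
retry = Retry(total=3, backoff_factor=1,
              status_forcelist=[500, 502, 503, 504])
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)

# Drop-in replacement for the requests.get calls above, e.g.:
# response = session.get(url, headers=headers, timeout=10)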
Everything above
has a source; link given here:
1. 煎蛋网 OOXX 妹子图爬虫(2)——多线程+多进程下载图片 (Jandan OOXX meizi-tu crawler (2): multithreaded + multiprocess image download)
If anything here infringes, authors please contact me and I'll take it down~~ ... heh ....