【自用】高举社会主义精神文明大旗,不建议推广!
解析html没有费多少精力,主要是下载失败太多,后来模拟了浏览器访问还是有很多失败,后来发现是Referer 头内容不合理导致的。修改后下载成功率 10页没有失败。
当然研究下载这里也费了些时间。
最后!自用!我强调了!自用!为了身体健康和精神文明!
此代码有版权!在发布250毫秒内禁止拷贝,传播,查看!! 250毫秒后版权过期。特此说明!
20170302 发现有时下载不了的问题,原因已查明:首次请求后服务器会返回某个 Etag,把它回传给服务器就能继续,否则就会出现读不到文件的错误。
看来要研究 http协议了!这个东西果然没那么简单!!
创建了一个server的字典。每次访问都记录下请求头和响应头,发现有Etag就反给服务器。成功率又高了好多。
代码无任何改动(除了图片文件保存地址)完整移植到了树莓派上!
需要注意的是:要安装树莓派版本的 lxml,包括两个相关组件(libxml2-dev 和 libxslt1-dev)。
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# python 3.6 64/win10 64/
# 下一个目标是记录下载历史,下载过的图就不再下载了 ,就用 sqlite3 吧!
import urllib
import urllib.request as request
import io
import gzip
from lxml import etree # 需要lxml 组件 自己用pip下载
import time
import datetime
import random
import traceback
import sys
from urllib.request import FancyURLopener
def make_header(call_url):
    """Build a fake browser request-header dict to raise the page-fetch success rate.

    The Referer is the calling URL with any '#fragment' removed, because the
    server rejects fragment-bearing referers (see the file header notes).

    :param call_url: URL of the page being requested.
    :return: dict of HTTP request headers.
    """
    referer = call_url
    hash_pos = call_url.find('#')
    if hash_pos > 0:
        # Fix: the original sliced a fixed 10 chars off the end, which removed
        # one character too many ('#comments' is only 9 chars). Cut exactly at
        # the fragment marker instead, so any fragment length works.
        referer = call_url[:hash_pos]
    return {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36",
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip,deflate",
        "Referer": referer,
        "Connection": "keep-alive",
        "If-Modified-Since": "Mon, 08 Jul 2013 18:06:40 GMT",
        "Cache-Control": "max-age=0"
    }
def insert_header(header, img_url, call_url):
    """Fill an URLopener-style object with fake browser headers to raise download success.

    :param header: object exposing ``addheader(name, value)`` and ``addheaders``
        (e.g. a ``FancyURLopener``); mutated in place.
    :param img_url: image URL about to be downloaded (used for Host and Accept).
    :param call_url: page URL the image came from (used as Referer).
    :return: the same ``header`` object, for chaining.
    """
    referer = call_url
    hash_pos = call_url.find('#')
    if hash_pos > 0:
        # Cut the Referer exactly at the fragment marker (consistent with
        # get_referer_url) instead of slicing a fixed 9 chars off the end.
        referer = call_url[:hash_pos]
    # Fix: extract the host between 'http://' and the next '/'. The original
    # fixed slice img_url[7:21] only happened to work for 14-char hosts such
    # as 'ww2.sinaimg.cn'.
    slash_pos = img_url.find('/', 7)
    host = img_url[7:slash_pos] if slash_pos > 0 else img_url[7:]
    header.addheader("Host", host)
    header.addheader("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36")
    if img_url.find('.gif') > 0:
        # Animated gifs are requested with a page-like Accept header.
        header.addheader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8")
    else:
        header.addheader("Accept", "*/*")
    header.addheader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3")
    header.addheader("Accept-Encoding", "gzip,deflate")
    header.addheader("Referer", referer)
    header.addheader("Connection", "keep-alive")
    header.addheader("Upgrade-Insecure-Requests", "1")
    header.addheader("If-Modified-Since", """Sun, 26 Feb 2017 03: 53:17 GMT""")
    header.addheader('If-None-Match', '"F57B886E1C77028F85FAA6F665CD559E"')
    header.addheader("Cache-Control", "max-age=0")
    # Print the request headers (debugging aid).
    for i in header.addheaders:
        print(i.__str__())
    return header
# Shared module-level parameters.
enter_url = 'http://jandan.net/ooxx'
enter_url1 = 'http://jandan.net/ooxx/page-2293#comments'
save_path = 'e:\\xpic\\'
download_fail_info = '... download fail'
# Maximum number of pages to walk forward from the entry page.
# NOTE(review): the original comment said "read at most 10 pages" but the
# value is 30 — confirm which is intended.
max_times = 30
curr_times = 0
class header_key:
    """One HTTP header as a name/value pair. (Defined but never used in this file.)"""
    # Header field name.
    head_name = ''
    # Header field value.
    header_value = ''
class server_info:
    """Per-host record of the request headers to send and the last response headers.

    Fix: the original declared the two header dicts as *class* attributes, so
    every instance mutated the same shared dicts (download_filex writes into
    ``s.server_request_header`` per host). They are now created per instance.
    """

    def __init__(self):
        # Host name this record belongs to, e.g. 'ww2.sinaimg.cn'.
        self.server_name = ''
        # Headers to send to this host on the next request.
        self.server_request_header = {}
        # Headers received in the most recent response from this host.
        self.server_response_header = {}
# Registry of per-host server_info records, keyed by host name.
server_infos = {}
# Template request headers used on the first contact with a host; 'Host' and
# 'Referer' are filled in per request by download_filex.
# NOTE(review): the name 'frist' is a typo for 'first' but is kept — renaming
# would break every reference below.
# NOTE(review): the 'If-Modified-Since' value contains a stray space
# ("03: 53:17"); it is sent verbatim — confirm servers tolerate it.
frist_request_header = {'Host': '',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36',
                        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
                        'Accept-Encoding': 'gzip,deflate',
                        'Referer': '',
                        'Connection': 'keep-alive',
                        'Upgrade-Insecure-Requests': '1',
                        'If-Modified-Since': 'Sun, 26 Feb 2017 03: 53:17 GMT',
                        'Cache-Control': 'max-age=0'}
# frist_request_header.fromkeys('Etag', '"2222"')
#
# frist_request_header['Etag'] = '"333"'
#
# for i in frist_request_header:
#     print(i, '==', frist_request_header[i])
# Fixtures for a standalone download test (see commented calls at file end).
img_u = 'http://ww2.sinaimg.cn/mw600/95e71c7fgw1fbdijnrk35j20dw0himz9.jpg'
file_n = 'e:\\xpic\\95e71c7fgw1fbdijnrk35j20dw0himz9.jpg'
def get_host(url_str):
    """Return the host portion of *url_str* (the text between '//' and the next '/').

    Generalized from the original ``url_str[7:url_str.find('/', 7)]``, which
    assumed an 'http://' prefix (broke on 'https://') and returned garbage when
    no path followed the host.

    :param url_str: absolute URL such as 'http://ww2.sinaimg.cn/mw600/x.jpg'.
    :return: host string, e.g. 'ww2.sinaimg.cn'.
    """
    scheme_end = url_str.find('//')
    start = scheme_end + 2 if scheme_end >= 0 else 0
    slash = url_str.find('/', start)
    return url_str[start:slash] if slash > 0 else url_str[start:]
def get_referer_url(url_str):
    """Return *url_str* with any '#fragment' stripped, for use as a Referer header.

    Fix: the original removed a fixed 9 characters from the end — correct only
    when the fragment was exactly '#comments'. Cut at the '#' itself so any
    fragment length works.

    :param url_str: page URL, possibly ending in a fragment.
    :return: the URL without its fragment.
    """
    hash_pos = url_str.find('#')
    if hash_pos > 0:
        return url_str[:hash_pos]
    return url_str
def download_filex(html_url, img_url, file_name):
    """Download *img_url* to *file_name*, replaying per-host request headers.

    Keeps one server_info record per host in the global ``server_infos`` dict.
    On first contact with a host, a copy of ``frist_request_header`` is seeded
    with the proper Host and Referer; after every response the server's Etag
    (if any) is echoed back via If-None-Match on subsequent requests — per the
    file-header notes, this is what made downloads stop failing.

    :param html_url: URL of the page the image was found on (becomes Referer).
    :param img_url: direct URL of the image file.
    :param file_name: local path to write the image to.
    """
    global server_infos
    curr_host = get_host(img_url)
    referer_url = get_referer_url(html_url)
    s = server_info()
    if curr_host not in server_infos:
        # First time we see this host: build and register its header record.
        s.server_name = curr_host
        for k in frist_request_header:
            v = frist_request_header[k]
            s.server_request_header[k] = v
        s.server_request_header['Host'] = curr_host
        s.server_request_header['Referer'] = referer_url
        server_infos[curr_host] = s
        print(len(server_infos))
        # print(s.server_request_header.__str__())
        # print_server_list()
        print('add ---- ', s.server_name, ' == ', s.server_request_header['Host'])
        # print('add ---- ', s)
    else:
        # Known host: reuse its record, refreshing only the Referer.
        s = server_infos[curr_host]
        s.server_request_header['Referer'] = referer_url
        print('load ---- ', curr_host)
        print('load ---- ', s.server_name, ' == ', s.server_request_header['Host'])
    req = request.Request(img_url)
    for k in s.server_request_header:
        v = s.server_request_header[k]
        req.add_header(k, v)
        # print('request = [%s , %s]' % (k, v))
    with request.urlopen(req) as f:
        print('Status:', f.status, f.reason)
        for k, v in f.getheaders():
            s.server_response_header[k] = v
            if k == 'Etag':
                # Echo the server's Etag back on the next request; without
                # this the server sometimes refuses to serve the file.
                s.server_request_header['If-None-Match'] = v
            # print('response = [%s: %s]' % (k, v))
        with open(file_name, 'wb+') as save_file:
            save_file.write(f.read())
# download_filex(img_u, file_n)
def print_server_list():
    """Dump every tracked server and its pending request headers to stdout."""
    global server_infos
    print('------------ server info ------------')
    for host, info in server_infos.items():
        print('server = [%s]' % (host))
        for field, value in info.server_request_header.items():
            print('server_request = [%s : %s]' % (field, value))
def get_html(url):
    """Fetch *url* and return its HTML as a str, transparently un-gzipping.

    Fix: the original always ran the body through GzipFile, so an
    uncompressed response raised inside the broad except and the page was
    silently skipped. Now the Content-Encoding response header is checked
    before inflating, and the response is closed via a context manager.

    :param url: page URL to fetch.
    :return: decoded HTML, or '' on any failure (callers treat '' as "skip").
    """
    print('get ...... ', url)
    curr_html_str = ''
    try:
        req = urllib.request.Request(url, headers=make_header(url))
        with urllib.request.urlopen(req) as resp:
            body = resp.read()
            # We advertise 'gzip,deflate', but the server may still answer
            # uncompressed — only inflate when it says it gzipped.
            if resp.headers.get('Content-Encoding') == 'gzip':
                body = gzip.GzipFile(fileobj=io.BytesIO(body), mode="rb").read()
        curr_html_str = body.decode("utf8")
    except Exception as ex:
        # Best-effort: log and fall through to the empty-string result.
        print(url, ' ...... ', ex.__str__())
    return curr_html_str
def make_rando():
    """Pick a random sleep duration in seconds, between two and three minutes."""
    return random.randint(120, 180)
def sleep():
    """Sleep for a randomly chosen 2–3 minute interval, logging when it starts."""
    duration = make_rando()
    stamp = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f')
    print(stamp, ' ... sleep ... ', duration)
    print_server_list()
    time.sleep(duration)
def get_img_url(call_url, html):
    """Find the static images (//img/@src) hosted on sinaimg and download each."""
    for src in html.xpath('//img/@src'):
        # Only Sina-hosted images ('//wwX.sinaimg.cn/...') are wanted.
        if src[6:13] != 'sinaimg':
            continue
        # Skip 'thumb' entries: static cover frames belonging to gif posts.
        if src.find('thumb') >= 0:
            continue
        full_url = 'http:' + src
        file_name = save_path + full_url[full_url.rfind('/')+1:]
        print(full_url, ' ----> ', file_name)
        try:
            download_filex(call_url, full_url, file_name)
        except Exception as ex:
            print(file_name, download_fail_info, ex.__str__())
def get_gif_url(call_url, html):
    """Find the animated gifs (held in //img/@org_src) and download each."""
    for src in html.xpath('//img/@org_src'):
        if src[6:13] != 'sinaimg':
            continue
        full_url = 'http:' + src
        file_name = save_path + full_url[full_url.rfind('/')+1:]
        print(full_url, ' ----> ', file_name)
        try:
            download_filex(call_url, full_url, file_name)
        except Exception as ex:
            # Include the full traceback: gif downloads were the flaky path.
            print(file_name, download_fail_info, ex.__str__(), traceback.format_exc())
def download_file(call_url, img_url, file_name):
    """Download *img_url* via a FancyURLopener dressed up as a real browser.

    Legacy path kept alongside download_filex; it rebuilds the opener's
    header list from scratch on every call.
    """
    opener = FancyURLopener()
    opener.addheaders.clear()
    opener = insert_header(opener, img_url, call_url)
    stream = opener.open(img_url)
    try:
        with open(file_name, 'wb+') as save_file:
            save_file.write(stream.read())
    except Exception as ex:
        print(file_name, ' ...... write fail ', ex.__str__())
    finally:
        stream.close()
def get_next_page(current_url):
    """Entry point: fetch *current_url*, download its images, then recurse to the next page.

    Stops when the page has no "Older Comments" link or after ``max_times``
    pages; recursion depth is therefore bounded by ``max_times``.

    :param current_url: page to process on this step.
    """
    global curr_times
    current_html_str = get_html(current_url)
    if current_html_str is None or current_html_str == '':
        return
    current_html = etree.HTML(current_html_str)
    # Static images (jpg etc.).
    get_img_url(current_url, current_html)
    # Animated gifs.
    get_gif_url(current_url, current_html)
    # Link to the next (older) page.
    page_result = current_html.xpath('//a[@title="Older Comments"]/@href')
    curr_times += 1
    # Stop when there is no next page or the page budget is used up.
    if len(page_result) <= 0 or curr_times >= max_times:
        return
    next_page_url = page_result[0]
    # Random sleep between pages (pretend a human is reading this one).
    sleep()
    #
    get_next_page(next_page_url)
# Kick off the crawl from the entry page (module-level side effect: runs on import).
get_next_page(enter_url)
# Fixtures for a standalone download test (kept for reference):
# img_u = 'http://ww2.sinaimg.cn/mw600/95e71c7fgw1fbdijnrk35j20dw0himz9.jpg'
# file_n = 'e:\\xpic\\95e71c7fgw1fbdijnrk35j20dw0himz9.jpg'
# download_file(img_u, file_n)