Note: this crawler only fetches pages that are already publicly reachable (anything the Baidu spider can crawl, for example), so it does not constitute any kind of cracking or circumvention.
A packaged single-threaded Python 3 crawler for batch-downloading files:
1. sleep() delays are friendly to the target server and will not saturate your own downstream bandwidth;
2. a single thread makes it easier to control file-download breakpoints (a Range-request resume sketch follows this list);
3. the synchronous flow downloads the whole batch of files one after another.
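Point 2's "breakpoints" can be pushed further with HTTP Range requests: if a partial file is already on disk, only the remaining bytes are requested. A minimal sketch, assuming the target server honours Range headers (the function name resume_download is mine, not part of the script below):

import os
import requests

def resume_download(file_url, path):
    # resume from however many bytes are already on disk
    start = os.path.getsize(path) if os.path.exists(path) else 0
    r = requests.get(file_url, headers={'Range': 'bytes=%d-' % start},
                     stream=True, timeout=30)
    if r.status_code in (200, 206):                    # 206 = Partial Content
        mode = 'ab' if r.status_code == 206 else 'wb'  # append, or rewrite if Range was ignored
        with open(path, mode) as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)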
The full code is below:
(A small number of files may come down empty or throw an error during a run; likely causes are a broken or mis-encoded file on the remote server, or the local disk filling up. Neither aborts the crawl: such files are simply skipped.)
import requests  # pip install requests
from bs4 import BeautifulSoup
import urllib.request
import os
import sys
import time
import random
from urllib import parse
# Append a block of text to a txt file
def write_txt(filename, info):
    with open(filename, 'a', encoding='utf-8') as txt:
        txt.write(info + '\n\n')
# Simple GET request, returns the response body as text
def request_get(get_url=''):
    get_response = requests.get(get_url)
    res = get_response.text
    return res
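# The note above mentions skipping files that error out; retrying the GET a
# few times first cuts down on transient failures. A sketch
# (request_get_retry is my name, not part of the original code):
def request_get_retry(get_url='', tries=3, delay=1.0):
    for attempt in range(tries):
        try:
            resp = requests.get(get_url, timeout=10)
            resp.raise_for_status()
            return resp.text
        except Exception:
            if attempt == tries - 1:
                return None  # give up and let the caller skip this url
            time.sleep(delay)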
# Simple POST request with form-encoded data
def request_post(post_url='', data_dict=None):
    if data_dict is None:
        data_dict = {'test': 'my test-post data', 'create_time': '2019'}  # sample form data, dict format
    res = requests.post(url=post_url, data=data_dict,
                        headers={'Content-Type': 'application/x-www-form-urlencoded'})
    return res
# GET request through an HTTP proxy (urllib version)
def use_proxy_request_api(url, proxy_addr='122.241.72.191:808'):
    req = urllib.request.Request(url)
    req.add_header("User-Agent",
                   "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0")
    proxy = urllib.request.ProxyHandler({'http': proxy_addr})
    opener = urllib.request.build_opener(proxy, urllib.request.HTTPHandler)
    urllib.request.install_opener(opener)
    res = urllib.request.urlopen(req).read().decode('utf-8', 'ignore')
    return res
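# The same proxied GET can also be done with requests alone; a minimal
# sketch, assuming the sample proxy address above is still alive
# (use_proxy_request_api2 is my name, not part of the original code):
def use_proxy_request_api2(url, proxy_addr='122.241.72.191:808'):
    proxies = {'http': 'http://' + proxy_addr}
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
    res = requests.get(url, proxies=proxies, headers=headers, timeout=10)
    return res.text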
# ############################################################################
# Extract a query-string parameter from a url
def get_url_param(url='', key=''):
    array = parse.parse_qs(parse.urlparse(url).query)
    return array[key]
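# Note that parse_qs maps each key to a *list* of values, so this helper
# returns a list even for a single-valued parameter, e.g.:
#   get_url_param('https://www.wendu.com/?page=3', 'page')  ->  ['3']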
# Fetch a url and return it as a parsed BeautifulSoup document
def get_url_html(url, state=0):
    if state == 0:
        url = domain + url  # relative path: prepend the main site domain
    # Pick a random User-Agent so successive requests look less uniform
    header_list = [
        {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.16 Safari/537.36 Edg/80.0.361.9'},
        {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'},
        {'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.3 (KHTML, like Gecko) Version/10.0 Mobile/14E304 Safari/602.1 wechatdevtools/1.02.1910120 MicroMessenger/7.0.4 Language/zh_CN webview/15780410115046065 webdebugger port/41084'},
        {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:71.0) Gecko/20100101 Firefox/71.0'},
        {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0'},
    ]
    headers = random.choice(header_list)
    req = urllib.request.Request(url=url, headers=headers)
    try:
        response = urllib.request.urlopen(req)
    except Exception:
        return None  # network/HTTP error: let the caller decide whether to skip or abort
    raw = response.read()  # read the body once; a second read() would return empty bytes
    try:
        page = raw.decode('utf-8')  # encodings seen in practice: utf-8, gb2312, GBK
    except UnicodeDecodeError:
        page = raw.decode('gb2312', 'ignore')
    html_string_page = str(page)  # plain string, can be written straight to a database
    soup_page = BeautifulSoup(html_string_page, "html.parser")  # parse the html tags
    return soup_page
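# If the encodings vary more than utf-8/gb2312, the charset can be detected
# instead of guessed. A sketch using chardet (pip install chardet;
# decode_html is my helper name, not part of the original code):
def decode_html(raw_bytes):
    import chardet  # imported here so the main script does not require it
    guess = chardet.detect(raw_bytes)  # e.g. {'encoding': 'GB2312', 'confidence': 0.99, ...}
    return raw_bytes.decode(guess['encoding'] or 'utf-8', 'ignore')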
# Scrape one article page and download its attached file
def get_html3(url, class_name, title):
    soup_page = get_url_html(url, 1)
    print(url)
    if soup_page is None:
        print('break')
        sys.exit()
    # print(soup_page)
    a = soup_page.find('a', attrs={'class': 'article-download'})
    file_href = a.get('href')
    file_info = file_href.split(".")
    try:
        file_info_path = file_info[-2].split("/")
        # print(a)
        print(title)
        print(file_href)
        print(file_info_path[-1])
        print(file_info[-1])
        root = "D:/python38/demo/wendu_file/"  # the last directory level is created automatically if missing
        path = root + class_name + '_' + title + '_' + file_info_path[-1] + '.' + file_info[-1]  # absolute file path
        # Save the file
        try:
            if not os.path.exists(root):
                os.mkdir(root)
            if not os.path.exists(path):
                r = requests.get(file_href)
                r.raise_for_status()
                # the with statement closes the file handle automatically
                with open(path, "wb") as f:  # "wb" = write binary
                    f.write(r.content)
                print("file saved successfully")
            else:
                print("file already exists")
        except Exception as e:
            print("failed to save file: " + str(e))
    except Exception as e:
        print("dead file link: " + str(e))
    # Save the article via the api (optional, disabled)
    # post_url = 'app/save_article'
    # post_data = {
    #     'course_class_id': pre_class_id,  # matching class id, set by hand
    #     'class_name': class_name,
    #     'title': title,
    #     'url': url,
    #     'content': content,
    #     'description': description,
    #     'div_content': div_content,
    # }
    # # print(post_data)
    # res = request_post(api + post_url, post_data)
    # print(res)
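# r.content above loads the whole file into memory. For very large files a
# streamed download is safer; a sketch (save_file_streamed is my name, not
# part of the original script):
def save_file_streamed(file_url, path):
    with requests.get(file_url, stream=True, timeout=30) as r:
        r.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)  # write the file piece by piece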
# Scrape one listing (catalogue) page and walk its article entries
def get_html2(url):
    soup_page = get_url_html(url, 1)
    print(url)
    if soup_page is None:
        sys.exit()
    that_div = soup_page.find('div', attrs={'class': 'date-word'}).find_all('div', attrs={'class': 'date-load'})
    that_h = soup_page.find_all('a', attrs={'class': 'current'})[1].get_text()
    if not that_h:
        that_h = '全部'  # '全部' = "all categories"
    # print(that_div)
    # print(that_h)
    for j in range(0, len(that_div) - 1):  # skips the last date-load div; use range(len(that_div)) to include it
        print('===' + str(j) + '===')
        div = that_div[j]
        a = div.find('div', attrs={'class': 'date-load-fl'}).find('a')
        a_txt = a.get_text()
        a_href = a.get('href')
        # print(a)
        # print([a_txt, a_href, that_h])
        get_html3(a_href, pre_class_name + that_h, a_txt)
        time.sleep(0.8)
# --- parameters ---
api = 'http://192.168.131.129/pydata/public/index.php/api/'
domain = 'http://xxxxx'  # main site url
pre_class_id = 9
pre_class_name = '医考_'  # category prefix for saved file names ("medical exam")
if __name__ == '__main__':  # script entry point
    print('--- start ---')
    # kick off the crawl here
    url = 'https://www.wendu.com/index.php?m=content&c=index&a=lists&catid=302&siteid=1&page='
    for a in range(1, 50):
        _url = url + str(a)
        # print(_url)
        get_html2(_url)
        time.sleep(2)
    print('--- done ---')
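A last small refinement toward point 1 (being friendly to the target server): a randomized pause is less predictable than the fixed time.sleep(2) used above. A one-line sketch:

    time.sleep(random.uniform(1.5, 3.0))  # random 1.5-3.0s pause between listing pages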