I. Scraping Images
1. Pick a site to scrape images from; here we use Baidu Images.
Baidu Images URL: https://image.baidu.com/
2. Write the code:
import os
import time
import requests
import urllib3

urllib3.disable_warnings()  # thumbnails are fetched with verify=False below
from tqdm import tqdm  # progress bar library

# cookies and headers were captured from a browser session via the developer
# tools; they expire over time, so replace them with your own
cookies = {
    'BDqhfp': '%E7%8B%97%E7%8B%97%26%26NaN-1undefined%26%2618880%26%2621',
    'BIDUPSID': '06338E0BE23C6ADB52165ACEB972355B',
    'PSTM': '1646905430',
    'BAIDUID': '104BD58A7C408DABABCAC9E0A1B184B4:FG=1',
    'BDORZ': 'B490B5EBF6F3CD402E515D22BCDA1598',
    'H_PS_PSSID': '35836_35105_31254_36024_36005_34584_36142_36120_36032_35993_35984_35319_26350_35723_22160_36061',
    'BDSFRCVID': '8--OJexroG0xMovDbuOS5T78igKKHJQTDYLtOwXPsp3LGJLVgaSTEG0PtjcEHMA-2ZlgogKK02OTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5',
    'H_BDCLCKID_SF': 'tJPqoKtbtDI3fP36qR3KhPt8Kpby2D62aKDs2nopBhcqEIL4QTQM5p5yQ2c7LUvtynT2KJnz3Po8MUbSj4QoDjFjXJ7RJRJbK6vwKJ5s5h5nhMJSb67JDMP0-4F8exry523ioIovQpn0MhQ3DRoWXPIqbN7P-p5Z5mAqKl0MLPbtbb0xXj_0D6bBjHujtT_s2TTKLPK8fCnBDP59MDTjhPrMypomWMT-0bFH_-5L-l5js56SbU5hW5LSQxQ3QhLDQNn7_JjOX-0bVIj6Wl_-etP3yarQhxQxtNRdXInjtpvhHR38MpbobUPUDa59LUvEJgcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj0DKLtD8bMC-RDjt35n-Wqxobbtof-KOhLTrJaDkWsx7Oy4oTj6DD5lrG0P6RHmb8ht59JROPSU7mhqb_3MvB-fnEbf7r-2TP_R6GBPQtqMbIQft20-DIeMtjBMJaJRCqWR7jWhk2hl72ybCMQlRX5q79atTMfNTJ-qcH0KQpsIJM5-DWbT8EjHCet5DJJn4j_Dv5b-0aKRcY-tT5M-Lf5eT22-usy6Qd2hcH0KLKDh6gb4PhQKuZ5qutLTb4QTbqWKJcKfb1MRjvMPnF-tKZDb-JXtr92nuDal5TtUthSDnTDMRhXfIL04nyKMnitnr9-pnLJpQrh459XP68bTkA5bjZKxtq3mkjbPbDfn02eCKuj6tWj6j0DNRabK6aKC5bL6rJabC3b5CzXU6q2bDeQN3OW4Rq3Irt2M8aQI0WjJ3oyU7k0q0vWtvJWbbvLT7johRTWqR4enjb3MonDh83Mxb4BUrCHRrzWn3O5hvvhKoO3MA-yUKmDloOW-TB5bbPLUQF5l8-sq0x0bOte-bQXH_E5bj2qRCqVIKa3f',
    'BDSFRCVID_BFESS': '8--OJexroG0xMovDbuOS5T78igKKHJQTDYLtOwXPsp3LGJLVgaSTEG0PtjcEHMA-2ZlgogKK02OTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5',
    'H_BDCLCKID_SF_BFESS': 'tJPqoKtbtDI3fP36qR3KhPt8Kpby2D62aKDs2nopBhcqEIL4QTQM5p5yQ2c7LUvtynT2KJnz3Po8MUbSj4QoDjFjXJ7RJRJbK6vwKJ5s5h5nhMJSb67JDMP0-4F8exry523ioIovQpn0MhQ3DRoWXPIqbN7P-p5Z5mAqKl0MLPbtbb0xXj_0D6bBjHujtT_s2TTKLPK8fCnBDP59MDTjhPrMypomWMT-0bFH_-5L-l5js56SbU5hW5LSQxQ3QhLDQNn7_JjOX-0bVIj6Wl_-etP3yarQhxQxtNRdXInjtpvhHR38MpbobUPUDa59LUvEJgcdot5yBbc8eIna5hjkbfJBQttjQn3hfIkj0DKLtD8bMC-RDjt35n-Wqxobbtof-KOhLTrJaDkWsx7Oy4oTj6DD5lrG0P6RHmb8ht59JROPSU7mhqb_3MvB-fnEbf7r-2TP_R6GBPQtqMbIQft20-DIeMtjBMJaJRCqWR7jWhk2hl72ybCMQlRX5q79atTMfNTJ-qcH0KQpsIJM5-DWbT8EjHCet5DJJn4j_Dv5b-0aKRcY-tT5M-Lf5eT22-usy6Qd2hcH0KLKDh6gb4PhQKuZ5qutLTb4QTbqWKJcKfb1MRjvMPnF-tKZDb-JXtr92nuDal5TtUthSDnTDMRhXfIL04nyKMnitnr9-pnLJpQrh459XP68bTkA5bjZKxtq3mkjbPbDfn02eCKuj6tWj6j0DNRabK6aKC5bL6rJabC3b5CzXU6q2bDeQN3OW4Rq3Irt2M8aQI0WjJ3oyU7k0q0vWtvJWbbvLT7johRTWqR4enjb3MonDh83Mxb4BUrCHRrzWn3O5hvvhKoO3MA-yUKmDloOW-TB5bbPLUQF5l8-sq0x0bOte-bQXH_E5bj2qRCqVIKa3f',
    'indexPageSugList': '%5B%22%E7%8B%97%E7%8B%97%22%5D',
    'cleanHistoryStatus': '0',
    'BAIDUID_BFESS': '104BD58A7C408DABABCAC9E0A1B184B4:FG=1',
    'BDRCVFR[dG2JNJb_ajR]': 'mk3SLVN4HKm',
    'BDRCVFR[-pGxjrCMryR]': 'mk3SLVN4HKm',
    'ab_sr': '1.0.1_Y2YxZDkwMWZkMmY2MzA4MGU0OTNhMzVlNTcwMmM2MWE4YWU4OTc1ZjZmZDM2N2RjYmVkMzFiY2NjNWM4Nzk4NzBlZTliYWU0ZTAyODkzNDA3YzNiMTVjMTllMzQ0MGJlZjAwYzk5MDdjNWM0MzJmMDdhOWNhYTZhMjIwODc5MDMxN2QyMmE1YTFmN2QyY2M1M2VmZDkzMjMyOThiYmNhZA==',
    'delPer': '0',
    'PSINO': '2',
    'BA_HECTOR': '8h24a024042g05alup1h3g0aq0q',
}

headers = {
    'Connection': 'keep-alive',
    'sec-ch-ua': '" Not;A Brand";v="99", "Google Chrome";v="97", "Chromium";v="97"',
    'Accept': 'text/plain, */*; q=0.01',
    'X-Requested-With': 'XMLHttpRequest',
    'sec-ch-ua-mobile': '?0',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36',
    'sec-ch-ua-platform': '"macOS"',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'cors',
    'Sec-Fetch-Dest': 'empty',
    'Referer': 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=result&fr=&sf=1&fmq=1647837998851_R&pv=&ic=&nc=1&z=&hd=&latest=&copyright=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&dyTabStr=MCwzLDIsNiwxLDUsNCw4LDcsOQ%3D%3D&ie=utf-8&sid=&word=%E7%8B%97%E7%8B%97',
    'Accept-Language': 'zh-CN,zh;q=0.9',
}


def craw_single_class(keyword, DOWNLOAD_NUM=200):
    folder_path = 'dataset/' + keyword
    if os.path.exists(folder_path):
        print('Folder dataset/{} already exists; scraped images will be saved into it'.format(keyword))
        # count only regular files, not sub-folders
        files_and_folders = os.listdir(folder_path)
        files = [f for f in files_and_folders if os.path.isfile(os.path.join(folder_path, f))]
        file_count = len(files)
        print(keyword + ':::' + str(file_count))
        # originally hardcoded as 200; use the DOWNLOAD_NUM parameter instead
        if file_count >= DOWNLOAD_NUM:
            return
    else:
        os.makedirs(folder_path)
        print('Created folder: dataset/{}'.format(keyword))

    count = 1
    with tqdm(total=DOWNLOAD_NUM, position=0, leave=True) as pbar:
        num = 0      # index of the image being downloaded; file names restart at 0,
                     # so files left over from an earlier run may be overwritten
        FLAG = True  # whether to keep crawling
        while FLAG:
            page = 30 * count  # results are paged 30 at a time
            params = (
                ('tn', 'resultjson_com'),
                ('logid', '12508239107856075440'),
                ('ipn', 'rj'),
                ('ct', '201326592'),
                ('is', ''),
                ('fp', 'result'),
                ('fr', ''),
                ('word', f'{keyword}'),
                ('queryWord', f'{keyword}'),
                ('cl', '2'),
                ('lm', '-1'),
                ('ie', 'utf-8'),
                ('oe', 'utf-8'),
                ('adpicid', ''),
                ('st', '-1'),
                ('z', ''),
                ('ic', ''),
                ('hd', ''),
                ('latest', ''),
                ('copyright', ''),
                ('s', ''),
                ('se', ''),
                ('tab', ''),
                ('width', ''),
                ('height', ''),
                ('face', '0'),
                ('istype', '2'),
                ('qc', ''),
                ('nc', '1'),
                ('expermode', ''),
                ('nojc', ''),
                ('isAsync', ''),
                ('pn', f'{page}'),
                ('rn', '30'),
                ('gsm', '1e'),
                ('1647838001666', ''),
            )
            response = requests.get('https://image.baidu.com/search/acjson',
                                    headers=headers, params=params, cookies=cookies)
            if response.status_code != 200:
                break
            try:
                json_data = response.json().get('data')
                if json_data:
                    for x in json_data:
                        img_type = x.get('type')  # renamed from `type`, which shadows the builtin
                        if img_type not in ['gif']:  # skip animated gifs
                            img = x.get('thumbURL')
                            fromPageTitleEnc = x.get('fromPageTitleEnc')  # page title, kept for the alternative file name below
                            try:
                                resp = requests.get(url=img, verify=False)
                                time.sleep(1)  # be polite: roughly one request per second
                                # print(f'link {img}')
                                # alternative file name including the page title:
                                # file_save_path = f'dataset/{keyword}/{num}-{fromPageTitleEnc}.{img_type}'
                                file_save_path = f'dataset/{keyword}/{num}.{img_type}'
                                with open(file_save_path, 'wb') as f:
                                    f.write(resp.content)
                                    f.flush()
                                # print('image {} ({}) downloaded'.format(num, fromPageTitleEnc))
                                num += 1
                                pbar.update(1)  # advance the progress bar
                                # stop once the requested number of images is reached
                                # (originally `num > DOWNLOAD_NUM`, an off-by-one)
                                if num >= DOWNLOAD_NUM:
                                    FLAG = False
                                    print('{} images downloaded'.format(num))
                                    break
                            except Exception:
                                pass
                else:
                    break  # no more results; stop instead of looping forever
            except Exception:
                pass
            count += 1


class_list = ['黄瓜', '南瓜', '冬瓜', '木瓜', '苦瓜', '丝瓜', '窝瓜', '甜瓜', '香瓜',
              '白兰瓜', '黄金瓜', '西葫芦', '人参果', '羊角蜜', '佛手瓜', '伊丽莎白瓜']
for each in class_list:
    craw_single_class(each, DOWNLOAD_NUM=200)
To scrape other kinds of images, simply put the keywords you want into class_list = [...].
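For instance, a minimal sketch that swaps in two hypothetical categories; the keywords and the count here are just placeholders:

# hypothetical keywords; any Baidu Images search term works
class_list = ['猫', '狗']
for each in class_list:
    craw_single_class(each, DOWNLOAD_NUM=100)  # 100 images per class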
The crawl in progress is shown in the figure:
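Thumbnails occasionally arrive truncated, or as error pages rather than images, so before using the scraped files as a dataset it can be worth a quick integrity pass. The following is a minimal sketch, assuming the Pillow library (pip install Pillow) and the dataset/<keyword>/ layout created above:

import os
from PIL import Image

# delete every file under dataset/ that Pillow cannot parse as an image
for keyword in os.listdir('dataset'):
    folder = os.path.join('dataset', keyword)
    if not os.path.isdir(folder):
        continue
    for name in os.listdir(folder):
        path = os.path.join(folder, name)
        try:
            with Image.open(path) as im:
                im.verify()  # cheap consistency check, no full decode
        except Exception:
            print('removing corrupt file:', path)
            os.remove(path)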
II. Scraping Videos
1. For videos, I'll use Bilibili as the example.
2. You need the URL of the specific video; for a multi-part video, the p parameter selects the part. For example: https://www.bilibili.com/video/BV1XB4y1P7xd/?p=41&spm_id_from=pageDriver&vd_source=c8d8d36b71cc82aaa71fffb815f6ed01
3. Write the download code for this video. It mainly drives the you-get tool, invoked here through subprocess:
import os
import subprocess

# Commonly used you-get options (see `you-get --help`):
# -o            output directory
# -O            output filename
# -l            download the whole playlist / multi-part list
# --format      pick a specific stream (quality/format)
# --no-caption  do not download subtitles/danmaku
# --no-merge    do not merge video and audio parts
# --debug       print debugging information
# --json        print stream information as JSON
# -i            print video information without downloading
# --version     print the you-get version
# --help        print the help text


def download_video_bilibili(url):
    global begin
    try:
        # download directory; raw string so the backslash survives
        output_dir = r'D:\English_video'
        # quote the URL: it contains '&', which the shell would otherwise
        # treat as a command separator
        command = f'you-get -o "{output_dir}" "{url}"'
        # run the you-get command-line tool
        subprocess.call(command, shell=True)
        # go through everything now in the download directory
        for file_name in os.listdir(output_dir):
            file_path = os.path.join(output_dir, file_name)
            # main video files are assumed to end in .mp4/.avi/.mov
            if file_name.endswith(('.mp4', '.avi', '.mov')):
                if 'my_video' not in file_name:
                    # rename the freshly downloaded video; change the
                    # pattern to whatever you need
                    new_file_name = os.path.join(output_dir, f'my_video{begin}.mp4')
                    os.rename(file_path, new_file_name)
                    begin += 1
            else:
                # not a main video file (e.g. subtitles), so delete it
                os.remove(file_path)
        return 1
    except Exception:
        print('Could not download a video from this URL!')
        return 0


begin = 1  # running index for the renamed files
i = 1
while i <= 43:  # BV1XB4y1P7xd is a 43-part video list
    url = f'https://www.bilibili.com/video/BV1XB4y1P7xd?p={i}&vd_source=c8d8d36b71cc82aaa71fffb815f6ed01'
    download_video_bilibili(url)
    i += 1
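As a side note, passing the arguments as a list to subprocess.run avoids the shell quoting issue entirely and makes failures easy to detect from the exit code. A minimal sketch under the same assumptions (you-get on the PATH, the same output directory):

import subprocess

def run_you_get(url, output_dir=r'D:\English_video'):
    # no shell is involved, so the '&' characters in the URL need no quoting
    result = subprocess.run(['you-get', '-o', output_dir, url])
    return result.returncode == 0  # you-get exits non-zero on failure

# e.g. run_you_get('https://www.bilibili.com/video/BV1XB4y1P7xd?p=1')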
The result of a run is shown in the figure:
4. You can also run the following command directly in a terminal:
you-get -o D:\English_video 'https://www.bilibili.com/video/BV1XB4y1P7xd/?p=41&spm_id_from=pageDriver&vd_source=c8d8d36b71cc82aaa71fffb815f6ed01'
The general pattern is simply: you-get -o <output dir> 'url'
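Two other invocations that may be handy here (check you-get --help for the exact flags in your installed version): you-get -i lists the available streams and qualities without downloading, and you-get -l (playlist mode) downloads every part of a multi-part video in one go, which would replace the p=1..43 loop in the script above:

you-get -i 'https://www.bilibili.com/video/BV1XB4y1P7xd'
you-get -o D:\English_video -l 'https://www.bilibili.com/video/BV1XB4y1P7xd'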
5. Alternatively, if you want another way to grab Bilibili videos, you can use this site: https://bili.iiilab.com/
Visiting it looks like this:
6. To play the downloaded videos, you can install the Tencent Video client and use it to play local files, as shown: