爬虫
安装requests模块:pip install requests
1.文字爬虫
import re  # regex extraction of quote text from the HTML
import requests  # third-party HTTP client

# 1. Text crawler: print every quote on the ishuo.cn front page.
response = requests.get('https://ishuo.cn/')  # fetch the page
data = response.text  # decoded HTML source as one string
# Each quote sits inside <div class="content">...</div>; capture the inner text
# with a non-greedy group so adjacent divs are matched separately, e.g.:
#   <div class="content">与人a1人间的信任,就像是纸片,一旦破损,就不会再回到原来的样子。</div>
#   <div class="content">一年奔波,尘缘遇了谁;一句珍重,天涯别了谁;一点灵犀,凭栏忆了.</div>
result_list = re.findall(r'<div class="content">(.*?)</div>', data)
for result in result_list:
    print(result)  # one quote per line
2.图片爬虫
import re  # regex extraction of image paths
import requests  # third-party HTTP client

# 2. Image crawler: download every .jpg referenced on listing pages 2-3
#    of pic.netbian.com into the current directory.
count = 0  # number of images written to disk
for page in range(2, 4):
    # index_{page}.html is the site's pagination scheme for listing pages.
    response = requests.get(f'http://pic.netbian.com/index_{page}.html')
    data = response.text  # decoded HTML source
    # Collect every src="..." attribute value; images are among them.
    result_list = re.findall(r'src="(.*?)"', data)
    for result in result_list:
        # Keep only real JPEG paths ('.jpg' extension, not just names
        # that happen to end in the letters "jpg").
        if result.endswith('.jpg'):
            # Paths are site-relative — prepend the host to form a full URL.
            img_url = f'http://pic.netbian.com{result}'
            img_response = requests.get(img_url)  # fetch raw image bytes
            img_name = img_url.split('/')[-1]  # file name = last path segment
            # .content is bytes, so the file must be opened in binary mode;
            # the with-block closes (and flushes) the file automatically.
            with open(img_name, 'wb') as f:
                f.write(img_response.content)
            count += 1
print(f'爬取了{count}张图片')
3.视频爬虫
import re  # regex extraction of video page links and stream URLs
import requests  # third-party HTTP client

# 3. Video crawler: download every video linked from ku6.com's index page
#    into the current directory.
response = requests.get('https://www.ku6.com/index')  # fetch the index page
data = response.text  # decoded HTML source
count = 0  # number of videos written to disk
# Each video card links to its detail page via:
#   <a class="video-image-warp" target="_blank" href="/video/...">
result_list = re.findall(r'<a class="video-image-warp" target="_blank" href="(.*?)">', data)
for result in result_list:
    # Only on-site detail pages (relative /video/... paths) are downloadable here.
    if result.startswith('/video'):
        detail_url = f'https://www.ku6.com{result}'  # absolute detail-page URL
        detail_response = requests.get(detail_url)
        detail_data = detail_response.text
        # The detail page embeds the stream address in a script, e.g.:
        #   flvURL: "https://rbv01.ku6.com/wifi/o_1dab1luo5oao1jnk1bpnk321hevckvs"
        video_urls = re.findall(r'flvURL: "(.*?)"', detail_data)
        if not video_urls:
            # Layout changed or no stream on this page — skip instead of
            # crashing on video_urls[0] (IndexError in the original).
            continue
        video_response = requests.get(video_urls[0])  # fetch raw video bytes
        video_name = f"{video_urls[0].split('/')[-1]}.mp4"  # name from URL tail
        # Binary write; the with-block closes (and flushes) the file.
        with open(video_name, 'wb') as fw:
            fw.write(video_response.content)
        count += 1
print(f'爬取了{count}个视频')