# -*- coding: utf-8 -*-
#@Time : 2020/4/7 15:17
#@Author : Liu Qinghao
#@FileName: test.py
#@Software: PyCharm
import requests
from bs4 import BeautifulSoup
import urllib.request
import os
"""循环存取每一页每个标题下的所有gif图片"""
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
}
count = 1
urls = []
base_link = "https://2maoww.com"
# Create a folder to store the images
file = input('Please name a folder to store the images: ')
if os.path.exists(file):
    print('That folder already exists, please enter another name')
    file = input('Please name a folder to store the images: ')
os.mkdir(file)
# First collect the URLs of all pages
url = 'https://qiukk87.com/arttype/16.html'
response = requests.get(url, headers=headers, timeout=7)
html = response.text
soup = BeautifulSoup(html, 'lxml')
links = soup.select('body > div:nth-child(2) > div.pagination > a:nth-child(9)')
page_all = int(links[0].contents[0])
# for i in range(page_all):
for i in range(3):
    if i == 0:
        urls.append(url)
    else:
        url = "https://qiukk87.com/arttype/16-" + str(i + 1) + ".html"
        urls.append(url)
for url in urls:
    # Collect the second-level links of every title on this page
    response = requests.get(url, headers=headers, timeout=7)
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    links_2 = soup.select('body > div > table > tbody > tr > td > a')
    # Walk the third-level title links under every second-level link
    # for link in links_2[0:1]:
    for link in links_2[0:3]:
        tmplink = base_link + link.attrs['href']
        response_link = requests.get(tmplink, headers=headers, timeout=7)
        htmltmp = response_link.text
        soup = BeautifulSoup(htmltmp, 'lxml')
        piclinks = soup.select("body > div > div > div > p > img")
        # names = soup.select("body > div > div > div > p.text-center.noveltext")
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)
        for piclink in piclinks[0:5]:
            # for piclink in piclinks:
            link_3 = piclink.attrs['data-original']
            # Download the gif
            urllib.request.urlretrieve(link_3, filename=os.path.join(file, link_3.split("/")[-1]))
            # urllib.request.urlretrieve(link_3, filename=os.path.join(file, names[0].text + link_3.split("/")[-1]))
            print("gif image", count, "downloaded.")
            count += 1
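The folder prompt above only re-asks once and will still crash if the second name is also taken. A minimal sketch of a reusable helper that keeps asking until the name is free (the function name prompt_new_dir is mine, not from the original script):

import os

def prompt_new_dir(prompt):
    # Keep asking until the user supplies a directory name that does not exist yet.
    name = input(prompt)
    while os.path.exists(name):
        print('That folder already exists, please enter another name')
        name = input(prompt)
    os.mkdir(name)
    return name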
Baidu image scraping
# -*- coding: utf-8 -*-
#@Time : 2020/4/13 22:03
#@Author : Liu Qinghao
#@FileName: baiduimage.py
#@Software: PyCharm
import re
import requests
from urllib import error
from bs4 import BeautifulSoup
import os
num = 0
numPicture = 0
file = ''
List = []
def Find(url):
    global List
    print('Counting the total number of images, please wait...')
    t = 0
    s = 0
    while t < 1000:
        Url = url + str(t)
        try:
            Result = requests.get(Url, timeout=7)
        except requests.exceptions.RequestException:
            t = t + 60
            print("---------------------")
            continue
        else:
            result = Result.text
            # Use a regex to find the image URLs first
            pic_url = re.findall('"objURL":"(.*?)",', result, re.S)
            s += len(pic_url)
            if len(pic_url) == 0:
                break
            else:
                List.append(pic_url)
                t = t + 60
    return s
def recommend(url):
    Re = []
    try:
        html = requests.get(url)
    except error.HTTPError:
        return
    else:
        html.encoding = 'utf-8'
        bsObj = BeautifulSoup(html.text, 'html.parser')
        div = bsObj.find('div', id='topRS')
        if div is not None:
            listA = div.findAll('a')
            for i in listA:
                if i is not None:
                    Re.append(i.get_text())
        return Re
def downloadPicture(html, keyword):
    global num
    # Use a regex to find the image URLs first
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    print('Found images for keyword ' + keyword + ', starting download...')
    for each in pic_url:
        print('Downloading image ' + str(num + 1) + ', URL: ' + str(each))
        try:
            if each is not None:
                pic = requests.get(each, timeout=7)
            else:
                continue
        except requests.exceptions.RequestException:
            print('Error: this image cannot be downloaded')
            continue
        else:
            string = os.path.join(file, keyword + '_' + str(num) + '.jpg')
            with open(string, 'wb') as fp:
                fp.write(pic.content)
            num += 1
            if num >= numPicture:
                return
if __name__ == '__main__':  # main entry point
    word = input("Enter a search keyword (a person, a place, etc.): ")
    # add = 'http://image.baidu.com
    url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + '&pn='
    tot = Find(url)
    Recommend = recommend(url)  # record the related-search suggestions
    print('Keyword %s: %d images found in total' % (word, tot))
    numPicture = int(input('How many images do you want to download? '))
    file = input('Please name a folder to store the images: ')
    if os.path.exists(file):
        print('That folder already exists, please enter another name')
        file = input('Please name a folder to store the images: ')
    os.mkdir(file)
    t = 0
    tmp = url
    while t < numPicture:
        try:
            url = tmp + str(t)
            result = requests.get(url, timeout=10)
            print(url)
        except error.HTTPError:
            print('Network error, please check your connection and retry')
            t = t + 60
        else:
            downloadPicture(result.text, word)
            t = t + 60
    print('Search finished, thanks for using')
    print('You may also like:')
    for r in Recommend:
        print(r, end=' ')
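Both Find and downloadPicture rely on the same "objURL" regex. A quick self-contained check against a hypothetical response fragment (not real Baidu output) shows what it captures:

import re

sample = '{"objURL":"http://example.com/a.jpg","width":1},{"objURL":"http://example.com/b.jpg","width":2},'
print(re.findall('"objURL":"(.*?)",', sample, re.S))
# prints ['http://example.com/a.jpg', 'http://example.com/b.jpg']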
Novel info scraping
import requests
import re
import json
def request_dandan(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        return None
def parse_result(html):
    # One large regex capturing rank, cover image, title, recommend rate, author, times on the list, and price
    pattern = re.compile(r'<li>.*?list_num.*?(\d+).</div>.*?<img src="(.*?)".*?class="name".*?title="(.*?)">.*?class="star">.*?class="tuijian">(.*?)</span>.*?class="publisher_info">.*?target="_blank">(.*?)</a>.*?class="biaosheng">.*?<span>(.*?)</span></div>.*?<p><span\sclass="price_n">¥(.*?)</span>.*?</li>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'rank': item[0],
            'image': item[1],
            'title': item[2],
            'recommend': item[3],
            'author': item[4],
            'times': item[5],
            'price': item[6]
        }
def write_item_to_file(item):
    print('Writing record ====> ' + str(item))
    with open('book.txt', 'a', encoding='UTF-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')
def main(page):
    url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(page)
    html = request_dandan(url)
    if html is None:
        return
    items = parse_result(html)  # parse out the fields we want
    for item in items:
        write_item_to_file(item)

if __name__ == "__main__":
    for i in range(1, 26):
        main(i)
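A single large regex like the one in parse_result breaks as soon as the markup shifts slightly. A hedged alternative sketch with BeautifulSoup (already used by the other scripts here); the class names are inferred from the regex above and are assumptions, not verified against the live page:

from bs4 import BeautifulSoup

def parse_result_bs4(html):
    # Sketch only: selectors mirror the class names in the regex
    # (list_num, name, tuijian, price_n) and may need adjusting.
    soup = BeautifulSoup(html, 'lxml')
    for li in soup.find_all('li'):
        num = li.find('div', class_='list_num')
        name = li.find('div', class_='name')
        price = li.find('span', class_='price_n')
        if num is None or name is None or price is None:
            continue  # not a ranking entry
        tuijian = li.find('span', class_='tuijian')
        yield {
            'rank': num.get_text(strip=True).rstrip('.'),
            'image': li.img['src'] if li.img else None,
            'title': name.a.get('title') if name.a else name.get_text(strip=True),
            'recommend': tuijian.get_text(strip=True) if tuijian else None,
            'price': price.get_text(strip=True).lstrip('¥'),
        }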
Video scraping
# -*- coding: utf-8 -*-
#@Time : 2020/4/7 15:17
#@Author : Liu Qinghao
#@FileName: test.py
#@Software: PyCharm
import requests
from bs4 import BeautifulSoup
import os
"""循环存取每一页每个标题下的所有视频"""
headers = {
"user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36",
}
count = 1
urls = []#视频url地址
title_all = []#视频标题
xunlei_addre_all = []#视频下载地址
tot = 0 #视频查找总数
page_all = 0#网站总页数
vedio_type_1 = []#视频一级分类
vedio_type_2 = []#视频二级分类
i=0
def video_type(base_link):
    response = requests.get(base_link, headers=headers, timeout=10000)
    response.encoding = response.apparent_encoding
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    links = soup.select('#nav-dianshiju > a')
    for i in range(len(links)):
        video_type_1.append(str(links[i].contents[0]))
    print(video_type_1)
    type_wanna = int(input("Which category of video do you want? Enter the tag's position: "))
    if type_wanna == 1:
        url = 'https://www.5456ye.com/vod/html1/'
    elif type_wanna == 2:
        url = 'https://www.5456ye.com/vod/html9/'
    elif type_wanna == 3:
        url = 'https://www.5456ye.com/vod/html16/'
    else:
        raise SystemExit('Sorry, this script does not support that video category yet!')
    response = requests.get(url, headers=headers, timeout=10000)
    response.encoding = response.apparent_encoding
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    links_2 = soup.select('body > div > div > div > div > div > dl > dd')
    for i in range(int((len(links_2[0].contents) - 1) / 2)):
        video_type_2.append(str(links_2[0].contents[2 * i + 1].contents[0]))
    print(video_type_2)
    type_end = int(input("Which sub-category under " + video_type_1[type_wanna - 1] + " do you want? Enter the tag's position: "))
    url = base_link + links_2[0].contents[2 * type_end - 1]['href']
    return url
def video_num(url):
    print('Counting videos, please wait...')
    global page_all
    global tot
    # Grab the last-page link first to learn the total page count
    try:
        response = requests.get(url, headers=headers, timeout=10000)
    except requests.exceptions.ConnectionError:
        print("Connection refused")
        return 0, 0
    response.encoding = response.apparent_encoding
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    links = soup.select('#long-page > ul > li:nth-child(15) > a')
    href = links[0]['href']
    page_all = int(href[href.find('index_') + 6:href.find('.html')])
    tot = page_all * 24
    print('Found %d videos in this category' % tot)
    return page_all, tot
def Find(url):
    print('Collecting the download links of all videos, please wait...')
    # for i in range(page_all):
    for i in range(10):
        if i == 0:
            # url = "https://www.2456ne.com/vod/html4/index.html"
            urls.append(url)
        else:
            url = "https://www.2456ne.com/vod/html4/index_" + str(i + 1) + ".html"
            urls.append(url)
    for url in urls:
        # Collect the second-level links of every title on this page
        response = requests.get(url, headers=headers, timeout=10000)
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        links_2 = soup.select('#content > li > a')
        # Walk the third-level title links under every second-level link
        # for link in links_2:
        for link in links_2[0:5]:
            tmplink = base_link + link.attrs['href']
            response_link = requests.get(tmplink, headers=headers, timeout=10000)
            response_link.encoding = response_link.apparent_encoding
            htmltmp = response_link.text
            soup = BeautifulSoup(htmltmp, 'lxml')
            addre_play = soup.find_all('ul', class_='playul')[0]
            addre_download = soup.find_all('ul', class_='playul')[1]
            # Extract the video title
            title = soup.select('#detail-box')[0].contents[1]
            title = title.find('img')['alt']
            # Extract the single video address
            addre = addre_download.find('a')['href']
            url_down = "https://www.2456ne.com/" + addre
            response = requests.get(url_down, headers=headers, timeout=10000)
            response.encoding = response.apparent_encoding
            html_addr = response.text
            soup = BeautifulSoup(html_addr, 'lxml')
            # Save the final third-level (Xunlei) download address
            xunlei_addre = soup.find_all('div', class_='download')[0].contents[1]['href']
            xunlei_addre_all.append(xunlei_addre)
            title_all.append(title)
    return xunlei_addre_all, title_all, urls
def urlsave(file):
    # Write the download links to a text file
    filename = file + "/download_links.txt"
    video_source = open(filename, 'w', encoding='utf-8')
    for i in range(len(xunlei_addre_all)):
        video_source.write(str(i) + '.' + title_all[i])
        video_source.write('\n')
        video_source.write(xunlei_addre_all[i])
        video_source.write('\n')
    video_source.write("Collected by LQH, you're welcome!")
    video_source.close()
    return video_source
def video_download(xunlei_addre_all, title_all):
    # Stream each video to disk until the requested count is reached
    global count
    for address, title in zip(xunlei_addre_all, title_all):
        if count > num_wanna:
            return
        if address is None:
            continue
        print("Starting download of video", count, "!")
        try:
            r = requests.get(address, headers=headers, stream=True, timeout=10000)
        except requests.exceptions.RequestException:
            print('Error: this video cannot be downloaded')
            continue
        string = os.path.join(file, title + '.mp4')
        with open(string, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
        print("Video", count, "downloaded!")
        count += 1
    return
if __name__ == '__main__':
    base_link = "https://www.5123wo.com/"
    # url = 'https://www.2456ne.com/vod/html4/index.html'
    url = video_type(base_link)
    page_all, tot = video_num(url)
    xunlei_addre_all, title_all, urls = Find(url)
    # Create a folder to store the videos
    file = input('Please name a folder to store the videos: ')
    if os.path.exists(file):
        print('That folder already exists, please enter another name')
        file = input('Please name a folder to store the videos: ')
    os.mkdir(file)
    video_source = urlsave(file)
    num_wanna = int(input("How many videos do you want to download? Enter a positive integer no greater than " + str(tot) + ": "))
    if num_wanna > tot:
        print('Too many videos requested, please decide again')
    else:
        video_download(xunlei_addre_all, title_all)
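One caveat on the requests calls throughout these scripts: timeout is measured in seconds, so timeout=10000 waits nearly three hours before giving up. A (connect, read) tuple with small values is the usual pattern; the sketch below uses illustrative numbers, not values from the original scripts:

import requests

def fetch(url, headers=None):
    # 5 s to establish the connection, 30 s between received chunks;
    # the exact values are illustrative assumptions.
    return requests.get(url, headers=headers, stream=True, timeout=(5, 30))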