Scraping Data from Common Websites with Web Crawlers

Scraping GIF images from an xx website

# -*- coding: utf-8 -*-
#@Time    : 2020/4/7 15:17
#@Author  : Liu Qinghao
#@FileName: test.py
#@Software: PyCharm

import os
import urllib.request

import requests
from bs4 import BeautifulSoup
"""循环存取每一页每个标题下的所有gif图片"""

headers = {
    "user-agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.92 Safari/537.36",
}
count = 1
urls = []
base_link = "https://2maoww.com"
# Create a folder to store the images
file = input('Enter a folder name for storing the images: ')
while os.path.exists(file):
    print('That folder already exists, please enter a different name')
    file = input('Enter a folder name for storing the images: ')
os.mkdir(file)

# First, determine the total number of pages
url = 'https://qiukk87.com/arttype/16.html'
response = requests.get(url, headers=headers, timeout=7)
html = response.text
soup = BeautifulSoup(html, 'lxml')
# The pagination's 9th link is assumed to show the last page number
links = soup.select('body > div:nth-child(2) > div.pagination > a:nth-child(9)')
page_all = int(links[0].contents[0])

# for i in range(page_all):  # crawl every page
for i in range(3):            # only the first 3 pages here
    if i == 0:
        urls.append(url)
    else:
        url = "https://qiukk87.com/arttype/16-" + str(i + 1) + ".html"
        urls.append(url)

for url in urls:
    # Collect the second-level links (one per title) on each page
    response = requests.get(url, headers=headers, timeout=7)
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    links_2 = soup.select('body > div > table > tbody > tr > td > a')
    # Visit each title page and collect the third-level image links
    # for link in links_2:
    for link in links_2[0:3]:
        tmplink = base_link + link.attrs['href']
        response_link = requests.get(tmplink, headers=headers, timeout=7)
        htmltmp = response_link.text
        soup = BeautifulSoup(htmltmp, 'lxml')
        piclinks = soup.select("body > div > div > div > p > img")
        # names = soup.select("body > div > div > div > p.text-center.noveltext")
        # Install a global opener so urlretrieve sends a browser User-Agent
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-Agent',
                              'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1941.0 Safari/537.36')]
        urllib.request.install_opener(opener)
        for piclink in piclinks[0:5]:
        # for piclink in piclinks:
            # "data-original" usually holds the lazy-loaded image URL
            link_3 = piclink.attrs['data-original']
            # Download the GIF into the target folder
            urllib.request.urlretrieve(link_3, filename=os.path.join(file, link_3.split("/")[-1]))
            print("GIF image", count, "downloaded.")
            count += 1
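
The download step above installs a global urllib opener just to attach a browser User-Agent to urlretrieve. A minimal alternative sketch of that single step using requests instead (the URL, folder name and User-Agent string below are placeholders, not taken from the site):

import os
import requests

headers = {"user-agent": "Mozilla/5.0"}  # any browser-like User-Agent string

def save_gif(gif_url, folder, headers):
    # Fetch the GIF bytes with the same kind of headers used for the HTML pages
    resp = requests.get(gif_url, headers=headers, timeout=7)
    resp.raise_for_status()                      # fail loudly on HTTP errors
    path = os.path.join(folder, gif_url.split("/")[-1])
    with open(path, 'wb') as f:
        f.write(resp.content)                    # GIFs are binary, so write bytes
    return path

# Usage (assuming the folder already exists):
# save_gif("https://example.com/some.gif", "gifs", headers)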

Scraping Baidu Images

# -*- coding: utf-8 -*-
#@Time    : 2020/4/13 22:03
#@Author  : Liu Qinghao
#@FileName: baiduimage.py
#@Software: PyCharm

import re
import requests
from bs4 import BeautifulSoup
import os

num = 0
numPicture = 0
file = ''
List = []


def Find(url):
    """Count how many images Baidu returns for the keyword, paging the pn parameter by 60 at a time."""
    global List
    print('Counting the total number of images, please wait...')
    t = 0
    s = 0
    while t < 1000:
        Url = url + str(t)
        try:
            Result = requests.get(Url, timeout=7)
        except BaseException:
            t = t + 60
            print("---------------------")
            continue
        else:
            result = Result.text
            # Extract the image URLs with a regular expression first
            pic_url = re.findall('"objURL":"(.*?)",', result, re.S)
            s += len(pic_url)
            if len(pic_url) == 0:
                break
            else:
                List.append(pic_url)
                t = t + 60
    return s


def recommend(url):
    """Collect Baidu's related-search suggestions for the keyword."""
    Re = []
    try:
        html = requests.get(url)
    except requests.exceptions.RequestException:
        return Re
    else:
        html.encoding = 'utf-8'
        bsObj = BeautifulSoup(html.text, 'html.parser')
        div = bsObj.find('div', id='topRS')
        if div is not None:
            listA = div.findAll('a')
            for i in listA:
                if i is not None:
                    Re.append(i.get_text())
        return Re


def dowmloadPicture(html, keyword):
    """Download every image URL found in one result page until numPicture is reached."""
    global num
    # Extract the image URLs with a regular expression first
    pic_url = re.findall('"objURL":"(.*?)",', html, re.S)
    print('Found images for keyword "' + keyword + '", starting download...')
    for each in pic_url:
        print('Downloading image ' + str(num + 1) + ', URL: ' + str(each))
        try:
            if each is not None:
                pic = requests.get(each, timeout=7)
            else:
                continue
        except BaseException:
            print('Error: this image could not be downloaded')
            continue
        else:
            string = os.path.join(file, keyword + '_' + str(num) + '.jpg')
            with open(string, 'wb') as fp:
                fp.write(pic.content)
            num += 1
        if num >= numPicture:
            return


if __name__ == '__main__':  # script entry point
    word = input("Enter a search keyword (a person, a place, etc.): ")
    # add = 'http://image.baidu.com
    url = 'http://image.baidu.com/search/flip?tn=baiduimage&ie=utf-8&word=' + word + '&pn='
    tot = Find(url)
    Recommend = recommend(url)  # record the related-search suggestions
    print('Detected %d "%s" images in total' % (tot, word))
    numPicture = int(input('How many images do you want to download? '))
    # Create a folder to store the images
    file = input('Enter a folder name for storing the images: ')
    while os.path.exists(file):
        print('That folder already exists, please enter a different name')
        file = input('Enter a folder name for storing the images: ')
    os.mkdir(file)
    t = 0
    tmp = url
    while t < numPicture:
        try:
            url = tmp + str(t)
            result = requests.get(url, timeout=10)
            print(url)
        except requests.exceptions.RequestException:
            print('Network error, please check your connection and retry')
            t = t + 60
        else:
            dowmloadPicture(result.text, word)
            t = t + 60

    print('Search finished, thanks for using this script')
    print('You might also like:')
    for r in Recommend:
        print(r, end='  ')
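
Both Find and dowmloadPicture rely on the same extraction trick: the flip-style result page embeds each original image address in an "objURL" field of an inline JSON blob, so a non-greedy regular expression pulls them all out without any HTML parsing. A small self-contained sketch of that step (the sample string is made up, not real Baidu output):

import re

# Hypothetical fragment shaped like what the result page embeds
sample = '"objURL":"http://img.example.com/a.jpg","fromURL":"x","objURL":"http://img.example.com/b.png",'

# Non-greedy match between "objURL":" and the closing ",
pic_urls = re.findall('"objURL":"(.*?)",', sample, re.S)
print(pic_urls)  # ['http://img.example.com/a.jpg', 'http://img.example.com/b.png']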

Scraping novel information

import requests
import re
import json

def request_dandan(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
    except requests.RequestException:
        return None


def parse_result(html):
    # Capture the seven fields yielded below from each <li> block
    pattern = re.compile(r'<li>.*?list_num.*?(\d+).</div>.*?<img src="(.*?)".*?class="name".*?title="(.*?)">.*?class="star">.*?class="tuijian">(.*?)</span>.*?class="publisher_info">.*?target="_blank">(.*?)</a>.*?class="biaosheng">.*?<span>(.*?)</span></div>.*?<p><span\sclass="price_n">&yen;(.*?)</span>.*?</li>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            'range': item[0],
            'image': item[1],
            'title': item[2],
            'recommend': item[3],
            'author': item[4],
            'times': item[5],
            'price': item[6]
        }


def write_item_to_file(item):
    print('Writing record ====> ' + str(item))
    with open('book.txt', 'a', encoding='UTF-8') as f:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')


def main(page):
    url = 'http://bang.dangdang.com/books/fivestars/01.00.00.00.00.00-recent30-0-0-1-' + str(page)
    html = request_dandan(url)
    if html is None:  # skip pages that failed to download
        return
    items = parse_result(html)  # parse out the fields we want
    for item in items:
        write_item_to_file(item)


if __name__ == "__main__":
    for i in range(1, 26):
        main(i)
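
Since write_item_to_file appends one JSON object per line (JSON Lines), reading the results back only takes one json.loads per line. A minimal sketch, assuming book.txt was produced by the script above:

import json

# Load the scraped records back from the JSON-Lines file written above
with open('book.txt', encoding='UTF-8') as f:
    books = [json.loads(line) for line in f if line.strip()]

# e.g. print the first ten titles with their prices
for book in books[:10]:
    print(book['title'], book['price'])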


Scraping videos

# -*- coding: utf-8 -*-
#@Time    : 2020/4/7 15:17
#@Author  : Liu Qinghao
#@FileName: test.py
#@Software: PyCharm

import os
import urllib.request

import requests
from bs4 import BeautifulSoup
"""Loop over every page and collect all videos under each title."""

headers = {
    "user-agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36",
}
count = 1
urls = []              # list-page URLs to crawl
title_all = []         # video titles
xunlei_addre_all = []  # video download addresses
tot = 0                # total number of videos found
page_all = 0           # total number of pages on the site
vedio_type_1 = []      # first-level video categories
vedio_type_2 = []      # second-level video categories

def vedio_type(base_link):
    """Let the user pick a first- and second-level category and return that category's URL."""
    response = requests.get(base_link, headers=headers, timeout=10000)
    response.encoding = response.apparent_encoding
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    # First-level categories from the navigation bar
    links = soup.select('#nav-dianshiju > a')
    for link in links:
        vedio_type_1.append(str(link.contents[0]))
    print(vedio_type_1)
    type_wanna = int(input("Which category of video do you want? Enter the position of the tag: "))
    if type_wanna == 1:
        url = 'https://www.5456ye.com/vod/html1/'
    elif type_wanna == 2:
        url = 'https://www.5456ye.com/vod/html9/'
    elif type_wanna == 3:
        url = 'https://www.5456ye.com/vod/html16/'
    else:
        raise SystemExit('Sorry, this script does not support that category yet!')
    response = requests.get(url, headers=headers, timeout=10000)
    response.encoding = response.apparent_encoding
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    # Second-level categories sit in every other child node of the first <dd>
    links_2 = soup.select('body > div > div > div > div > div > dl > dd')
    for i in range(int((len(links_2[0].contents) - 1) / 2)):
        vedio_type_2.append(str(links_2[0].contents[2 * i + 1].contents[0]))
    print(vedio_type_2)
    type_end = int(input("Which sub-category of " + vedio_type_1[type_wanna - 1] + " do you want? Enter the position of the tag: "))
    url = base_link + links_2[0].contents[2 * type_end - 1]['href']
    return url


def vedio_num(url):
    """Work out how many pages (and therefore videos) the chosen category has."""
    print('Counting the total number of videos, please wait...')
    global page_all
    global tot
    # Fetch the category index page first
    try:
        response = requests.get(url, headers=headers, timeout=10000)
    except requests.exceptions.ConnectionError:
        raise SystemExit('Connection refused')
    response.encoding = response.apparent_encoding
    html = response.text
    soup = BeautifulSoup(html, 'lxml')
    # The 15th pagination item is assumed to link to the last page (.../index_<last>.html)
    links = soup.select('#long-page > ul > li:nth-child(15) > a')
    href = links[0]['href']
    page_all = int(href[href.find('index_') + 6:href.find('.html')])
    tot = page_all * 24  # assumes 24 videos per page
    print('Detected %d videos in this category' % tot)
    return page_all, tot

def Find(url):
    """Collect the download address and title of every video."""
    print('Collecting the download links for all videos, please wait...')
    # for i in range(page_all):  # crawl every page
    for i in range(10):           # only the first 10 pages here
        if i == 0:
            # url = "https://www.2456ne.com/vod/html4/index.html"
            urls.append(url)
        else:
            url = "https://www.2456ne.com/vod/html4/index_" + str(i + 1) + ".html"
            urls.append(url)

    for url in urls:
        # Collect the second-level links (one per title) on each page
        response = requests.get(url, headers=headers, timeout=10000)
        html = response.text
        soup = BeautifulSoup(html, 'lxml')
        links_2 = soup.select('#content > li > a')

        # Visit each title page and collect the third-level links
        # for link in links_2:
        for link in links_2[0:5]:
            tmplink = base_link + link.attrs['href']
            response_link = requests.get(tmplink, headers=headers, timeout=10000)
            response_link.encoding = response_link.apparent_encoding
            htmltmp = response_link.text
            soup = BeautifulSoup(htmltmp, 'lxml')
            addre_play = soup.find_all('ul', class_='playul')[0]      # streaming list (unused)
            addre_download = soup.find_all('ul', class_='playul')[1]  # download list
            # Video title
            title = soup.select('#detail-box')[0].contents[1]
            title = title.find('img')['alt']
            # Page holding this video's download address
            addre = addre_download.find('a')['href']
            url_down = "https://www.2456ne.com/" + addre
            response = requests.get(url_down, headers=headers, timeout=10000)
            response.encoding = response.apparent_encoding
            html_addr = response.text
            soup = BeautifulSoup(html_addr, 'lxml')
            # Save the final (third-level) download address
            xunlei_addre = soup.find_all('div', class_='download')[0].contents[1]['href']
            xunlei_addre_all.append(xunlei_addre)
            title_all.append(title)
    return xunlei_addre_all, title_all, urls

def urlsave(file):
    """Write every title and its download link into a text file inside the target folder."""
    filename = file + "/download_links.txt"
    video_sourse = open(filename, 'w', encoding='utf-8')
    for i in range(len(xunlei_addre_all)):
        video_sourse.write(str(i) + '.' + title_all[i])
        video_sourse.write('\n')
        video_sourse.write(xunlei_addre_all[i])
        video_sourse.write('\n')
    video_sourse.write("Collected by LQH, you're welcome!")
    video_sourse.close()
    return video_sourse

def vedio_download(xunlei_addre_all, title_all):
    """Stream each video to disk until num_wanna videos have been saved."""
    global count
    for address, title in zip(xunlei_addre_all, title_all):
        if num_wanna > count - 1:  # stop once enough videos have been saved
            try:
                if address is not None:
                    video = requests.get(address, timeout=10000)
                else:
                    continue
            except BaseException:
                print('Error: this video could not be downloaded')
                continue
            else:
                opener = urllib.request.build_opener()
                opener.addheaders = [('User-Agent',
                                      'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.106 Safari/537.36')]
                urllib.request.install_opener(opener)

                print("Starting download of video", count)
                r = requests.get(address, headers=headers, stream=True)
                string = file + "/" + title + '.mp4'
                # Write the video in 1 MB chunks so large files never sit in memory
                with open(string, 'wb') as f:
                    for chunk in r.iter_content(chunk_size=1024 * 1024):
                        if chunk:
                            f.write(chunk)
                print("Video", count, "downloaded!")
                count += 1
    return

if __name__ == '__main__':
    base_link = "https://www.5123wo.com/"
    # url = 'https://www.2456ne.com/vod/html4/index.html'
    url = vedio_type(base_link)
    page_all, tot = vedio_num(url)
    xunlei_addre_all, title_all, urls = Find(url)
    # Create a folder to store the videos
    file = input('Enter a folder name for storing the videos: ')
    while os.path.exists(file):
        print('That folder already exists, please enter a different name')
        file = input('Enter a folder name for storing the videos: ')
    os.mkdir(file)
    video_sourse = urlsave(file)
    num_wanna = int(input("How many videos do you want to download? Enter a positive integer less than " + str(tot) + ": "))
    if num_wanna > tot:
        print('Too many videos requested, please run again with a smaller number')
    else:
        vedio_download(xunlei_addre_all, title_all)
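
vedio_download streams each video in 1 MB chunks so a large file never has to fit in memory. The same pattern, pulled out into a standalone sketch with a simple progress readout based on the Content-Length header (the URL and file name below are placeholders, not taken from the site above):

import requests

def stream_download(url, path, headers=None, chunk_mb=1):
    # Stream the response body instead of loading it all at once
    with requests.get(url, headers=headers, stream=True, timeout=30) as r:
        r.raise_for_status()
        total = int(r.headers.get('Content-Length', 0))  # 0 if the server omits it
        written = 0
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=chunk_mb * 1024 * 1024):
                if chunk:
                    f.write(chunk)
                    written += len(chunk)
                    if total:
                        print('\r%.1f%%' % (100 * written / total), end='')
    print()
    return written

# Usage:
# stream_download('https://example.com/video.mp4', 'video.mp4', headers=headers)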