从入门到入狱-----信息爬取以及图片下载（selenium、PyQuery、bs4）

最新推荐文章于 2021-08-27 21:35:46 发布

第壹大魔王

最新推荐文章于 2021-08-27 21:35:46 发布

阅读量446

点赞数 1

分类专栏：学习、作业、笔记　文章标签：python、爬虫

本文链接：https://blog.csdn.net/weixin_44628421/article/details/109404960

版权

学习同时被 3 个专栏收录

42 篇文章 2 订阅

订阅专栏

笔记

35 篇文章 0 订阅

订阅专栏

作业

7 篇文章 0 订阅

订阅专栏

本来是想写一个爬虫类的，结果越写越偏，也不太想改了，所以这个“四不像”的类就将就着看吧。后面都是单独定义的函数：因为爬取的数据都是临时想到的，也就不好放在类里面（其实就是懒得改）。

from selenium import webdriver
import requests
from pyquery import PyQuery
from bs4 import BeautifulSoup


class Spider:
    """Small HTTP helper that fetches a page while presenting a Chrome User-Agent."""

    def req_(self, url: str, encodings='utf-8', timeout=10):
        """Send a GET request to *url* and return the decoded body.

        Args:
            url: Address to request.
            encodings: Encoding assigned to the response before its text is read.
            timeout: Seconds passed to ``requests.get``; without a timeout a
                stalled server would block the call indefinitely.

        Returns:
            The response text when the status code is 200, otherwise ``None``.

        Raises:
            requests.RequestException: Propagated unchanged from ``requests.get``
                (connection errors, timeouts, malformed URLs).
        """
        headers = {
            'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}
        respons = requests.get(url, headers=headers, timeout=timeout)
        respons.encoding = encodings
        if respons.status_code != 200:
            print("请求失败")
            # Explicit so callers can see that failure is signalled by None.
            return None
        print("请求成功")
        print(respons.text)
        return respons.text


# Module-level Spider instance; movie_() uses it for its HTTP requests.
s1 = Spider()

# Fetch and print the movie information for The Shawshank Redemption.
def movie_():
    """Print details of the first film linked from Douban's Top 250 page.

    The function requests the Top 250 list through the module-level ``s1``
    spider, follows the ``href`` of the ``.item>.pic>a`` match, and reads the
    title plus the ``#info>span`` entries of the detail page.  The field
    positions used below are fixed indexes into that span list, so they depend
    on the page keeping its current layout.

    Returns:
        None. The result is printed; on any failed step a short message is
        printed and the function returns early instead of crashing.
    """
    top_page = s1.req_('https://movie.douban.com/top250')
    if top_page is None:
        print('获取榜单页面失败')
        return

    # attr() on a multi-element match yields the value of the first element.
    detail_url = PyQuery(top_page)('.item>.pic>a').attr('href')
    if not detail_url:
        print('未找到电影详情链接')
        return

    detail_page = s1.req_(detail_url)
    if detail_page is None:
        print('获取电影详情页面失败')
        return

    movie = PyQuery(detail_page)('#content')
    name = movie('h1>span').text()
    list1 = [PyQuery(node).text() for node in movie('#info>span')]

    # The highest index read below is 12; bail out rather than raise IndexError
    # when the page exposes fewer spans than expected.
    if len(list1) < 13:
        print('电影信息结构与预期不符')
        return

    # [4:] drops the four-character label prefix that precedes each value.
    movie_message = {
        '片名': name, '导演': list1[0][4:], '编剧': list1[1][4:],
        '主演': list1[2][4:], '类型': list1[4] + list1[5],
        '上映日期': list1[9] + list1[10], '片长': list1[12],
    }
    print('肖申克的救赎电影信息：')
    for key, value in movie_message.items():
        print(f'{key}:{value}')
# Second Spider instance; nothing in the visible code references it.
s2=Spider()



# Download the League of Legends hero pictures.
def lol_hero_pic():
    """Save every hero image found on the LoL hero list page as a PNG file.

    The page is loaded in a Chrome instance driven by selenium, its source is
    parsed with BeautifulSoup, and each ``#jSearchHeroDiv>li>a>img`` element is
    downloaded into ``LOLpictures/<alt>.png`` (spaces in ``alt`` become ``-``).

    Returns:
        None. The images are written to disk.
    """
    import os
    from urllib.parse import urljoin

    url = 'https://lol.qq.com/data/info-heros.shtml'
    b = webdriver.Chrome()
    try:
        b.get(url)
        result = b.page_source
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        b.quit()

    # open() does not create missing directories.
    os.makedirs('LOLpictures', exist_ok=True)

    soup = BeautifulSoup(result, 'lxml')
    for img in soup.select('#jSearchHeroDiv>li>a>img'):
        # urljoin resolves protocol-relative sources ("//host/path") against
        # the page URL and leaves already-absolute sources untouched.
        d = urljoin(url, img.attrs['src'])
        name = img.attrs['alt'].replace(' ', '-')
        results = requests.get(d, timeout=10).content
        with open(f'LOLpictures/{name}.png', 'wb') as f:
            f.write(results)

# Download the Honor of Kings hero pictures.
def wzry_hero_pic():
    """Save every hero image found on the Honor of Kings hero list page.

    The page is loaded in a Chrome instance driven by selenium, its source is
    parsed with BeautifulSoup, and each ``.herolist.clearfix>li>a>img`` element
    is downloaded into ``王者荣耀英雄图片/<alt>.png``.

    Returns:
        None. The images are written to disk and one progress line is printed
        per saved file.
    """
    import os
    from urllib.parse import urljoin

    url = 'https://pvp.qq.com/web201605/herolist.shtml'
    b = webdriver.Chrome()
    try:
        b.get(url)
        result = b.page_source
    finally:
        # Always shut the browser down so no Chrome process is left behind.
        b.quit()

    # open() does not create missing directories.
    os.makedirs('王者荣耀英雄图片', exist_ok=True)

    soup = BeautifulSoup(result, 'lxml')
    for img in soup.select('.herolist.clearfix>li>a>img'):
        # urljoin resolves protocol-relative sources ("//host/path") against
        # the page URL and leaves already-absolute sources untouched.
        d = urljoin(url, img.attrs['src'])
        name = img.attrs['alt']
        results = requests.get(d, timeout=10).content
        path = f'王者荣耀英雄图片/{name}.png'
        with open(path, 'wb') as f:
            f.write(results)
        # Report the saved file instead of dumping the whole page source,
        # which the previous print(result) did after every image.
        print(path)


# Runs the Honor of Kings picture download as soon as this file is executed or imported.
wzry_hero_pic()