从入门到入狱-----信息爬取以及图片下载(selenium、PyQuery、bs4)

35 篇文章 0 订阅
7 篇文章 0 订阅

本来是想写一个爬虫类的,结果越写越偏,也不太想改了,所以这个四不像的类就将就着看吧。后面都是定义的函数:因为爬取的数据都是临时想到的,也就不好放在类里面(就是懒得改)。

import os

import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery
from selenium import webdriver


class Spider:
    """Minimal HTTP fetcher: GET a URL and return the decoded page text."""

    # Desktop Chrome UA so sites that reject the default requests UA still answer.
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"
    )

    def req_(self, url: str, encodings: str = 'utf-8', timeout: float = 10.0):
        """GET *url* and return the response body as text.

        :param url: absolute URL to fetch
        :param encodings: charset used to decode the body (site-dependent)
        :param timeout: seconds to wait before giving up (fix: the original
            request had no timeout and could hang forever)
        :return: decoded body on HTTP 200, otherwise ``None``
        """
        respons = requests.get(
            url,
            headers={'User-Agent': self.USER_AGENT},
            timeout=timeout,
        )
        respons.encoding = encodings
        if respons.status_code == 200:
            print("请求成功")
            # fix: the original also printed the entire page body here —
            # debug leftover that floods stdout on every successful request.
            return respons.text
        print("请求失败")
        return None  # explicit: callers must check for a failed fetch


# Shared fetcher instance used by the scraping functions below.
s1 = Spider()

# 获取肖申克的救赎的电影信息
def movie_():
    respons = s1.req_('https://movie.douban.com/top250')
    pyq = PyQuery(respons)
    all_movies = pyq('.item>.pic>a')
    movie=all_movies.attr('href')
    respons=s1.req_(movie)
    pyq=PyQuery(respons)
    the_movie=pyq('#content')
    movie = PyQuery(the_movie)

    ranking=movie('.top250>.top250-no')
    name=movie('h1>span').text()


    data=movie('#info')
    a=movie('#info>span')
    list1=[PyQuery(i).text() for i in a]

    movie_message={
        '片名':name,'导演':list1[0][4:],'编剧':list1[1][4:],
        '主演':list1[2][4:],'类型':list1[4]+list1[5],
        '上映日期':list1[9]+list1[10],'片长':list1[12]
    }
    print('肖申克的救赎电影信息:')
    for i in movie_message:

        print(f'{i}:{movie_message[i]}')

# NOTE(review): this second instance is never used anywhere in the file —
# candidate for removal, kept to avoid breaking any external callers.
s2=Spider()



# Download every League of Legends champion portrait into LOLpictures/.
def lol_hero_pic():
    """Render the LOL hero-list page with Selenium and save each portrait.

    Selenium is needed because the hero grid is filled in by JavaScript,
    so a plain HTTP GET would not contain the <img> tags.
    Files are written as ``LOLpictures/<alt-text>.png``.
    """
    url = 'https://lol.qq.com/data/info-heros.shtml'
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        page_source = browser.page_source
    finally:
        # fix: the original never closed the browser, leaking a Chrome
        # process on every call.
        browser.quit()

    # fix: the original crashed with FileNotFoundError if the target
    # directory did not already exist.
    os.makedirs('LOLpictures', exist_ok=True)

    soup = BeautifulSoup(page_source, 'lxml')
    for img in soup.select('#jSearchHeroDiv>li>a>img'):
        pic_url = 'https:' + img.attrs['src']
        # Spaces in names would be awkward in filenames; keep original's dash.
        name = img.attrs['alt'].replace(' ', '-')
        image_bytes = requests.get(pic_url).content
        with open(f'LOLpictures/{name}.png', 'wb') as f:
            f.write(image_bytes)

# Download every 王者荣耀 (Honor of Kings) hero portrait.
def wzry_hero_pic():
    """Render the hero-list page with Selenium and save each hero portrait.

    Files are written as ``王者荣耀英雄图片/<alt-text>.png``.
    """
    url = 'https://pvp.qq.com/web201605/herolist.shtml'
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        page_source = browser.page_source
    finally:
        # fix: the original never closed the browser, leaking a Chrome
        # process on every call.
        browser.quit()

    # fix: the original crashed with FileNotFoundError if the target
    # directory did not already exist.
    os.makedirs('王者荣耀英雄图片', exist_ok=True)

    soup = BeautifulSoup(page_source, 'lxml')
    for img in soup.select('.herolist.clearfix>li>a>img'):
        pic_url = 'https:' + img.attrs['src']
        name = img.attrs['alt']
        image_bytes = requests.get(pic_url).content
        with open(f'王者荣耀英雄图片/{name}.png', 'wb') as f:
            f.write(image_bytes)
            # fix: the original did `print(result)` here, dumping the whole
            # page source to stdout once per downloaded image.


# Script entry point: runs on import as well — consider guarding with
# `if __name__ == "__main__":` so importing this module has no side effects.
wzry_hero_pic()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值