本来是想写一个爬虫类的,结果越写越偏,也不太想改了,所以这个四不像的类就将就着看吧。后面都是单独定义的函数:因为要爬取的数据都是临时想到的,也就不好放在类里面(其实就是懒得改)。
import os

import requests
from bs4 import BeautifulSoup
from pyquery import PyQuery
from selenium import webdriver
class Spider:
    """Small HTTP helper that issues GET requests with a desktop-browser User-Agent."""

    # Header set sent with every request made through req_().
    _HEADERS = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.111 Safari/537.36"}

    def req_(self, url: str, encodings='utf-8', timeout=10):
        """Fetch *url* with a GET request and return the decoded body.

        Args:
            url: Address to request.
            encodings: Encoding assigned to the response before its text is read.
            timeout: Seconds to wait for the server. Without a timeout a
                stalled server would block the call indefinitely; with it,
                ``requests`` raises a timeout exception instead.

        Returns:
            The response text when the status code is 200, otherwise ``None``.
        """
        respons = requests.get(url, headers=self._HEADERS, timeout=timeout)
        respons.encoding = encodings
        if respons.status_code != 200:
            print("请求失败")
            # Explicit None so callers can tell failure apart from a body.
            return None
        print("请求成功")
        # NOTE: the whole body is echoed to stdout, as in the original code.
        print(respons.text)
        return respons.text
# Module-level Spider instance; movie_() below issues its requests through it.
s1 = Spider()
# Fetch and print the details of The Shawshank Redemption from Douban.
def movie_():
    """Print details of the film behind the first link on Douban's Top 250 page.

    The listing page is requested through the module-level ``s1`` spider, the
    first ``.item>.pic>a`` link is followed, and the title plus selected
    ``#info`` fields of the detail page are printed.

    Returns:
        None. If either request fails, ``req_`` has already printed the
        failure message and the function returns without parsing anything.
    """
    listing_html = s1.req_('https://movie.douban.com/top250')
    if listing_html is None:
        # req_ returns None on a non-200 response; nothing to parse.
        return

    # attr() is expected to read from the first matched element, i.e. the
    # top entry of the list (jQuery-style behaviour; confirm against pyquery).
    detail_url = PyQuery(listing_html)('.item>.pic>a').attr('href')

    detail_html = s1.req_(detail_url)
    if detail_html is None:
        return

    content = PyQuery(PyQuery(detail_html)('#content'))
    name = content('h1>span').text()
    # Text of every direct <span> child of #info, in document order.
    spans = [PyQuery(node).text() for node in content('#info>span')]

    # The indices below rely on the span order of this particular detail
    # page; [4:] drops a four-character prefix from each of the first three
    # entries (presumably the "label: " part; verify against the live page).
    movie_message = {
        '片名': name,
        '导演': spans[0][4:],
        '编剧': spans[1][4:],
        '主演': spans[2][4:],
        '类型': spans[4] + spans[5],
        '上映日期': spans[9] + spans[10],
        '片长': spans[12],
    }
    print('肖申克的救赎电影信息:')
    for field, value in movie_message.items():
        print(f'{field}:{value}')
# Second Spider instance; nothing in the visible code references it.
s2=Spider()
# Download the League of Legends hero images.
def lol_hero_pic():
    """Save every hero image from the LoL hero list page into ``LOLpictures/``.

    The page is opened in a Selenium-driven Chrome, its page source is parsed
    with BeautifulSoup, and each ``#jSearchHeroDiv>li>a>img`` image is fetched
    with ``requests`` and written as ``<alt>.png`` (spaces in the ``alt``
    text are replaced by hyphens).
    """
    url = 'https://lol.qq.com/data/info-heros.shtml'
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        page_html = browser.page_source
    finally:
        # Always shut the browser down; otherwise every run leaves a Chrome
        # process behind, even when loading the page raises.
        browser.quit()

    # open() does not create missing directories, so make sure it exists.
    os.makedirs('LOLpictures', exist_ok=True)

    for img in BeautifulSoup(page_html, 'lxml').select('#jSearchHeroDiv>li>a>img'):
        # "https:" is prepended to every src value as-is
        # (presumably they are protocol-relative; verify against the page).
        image_url = 'https:' + img.attrs['src']
        name = img.attrs['alt'].replace(' ', '-')
        # Timeout keeps one stalled download from hanging the whole run.
        image_bytes = requests.get(image_url, timeout=10).content
        with open(f'LOLpictures/{name}.png', 'wb') as f:
            f.write(image_bytes)
# Download the Honor of Kings hero images.
def wzry_hero_pic():
    """Save every hero image from the Honor of Kings hero list page.

    The page is opened in a Selenium-driven Chrome, its page source is parsed
    with BeautifulSoup, and each ``.herolist.clearfix>li>a>img`` image is
    fetched with ``requests`` and written to ``王者荣耀英雄图片/<alt>.png``.
    The page source is printed once all images have been written.
    """
    url = 'https://pvp.qq.com/web201605/herolist.shtml'
    browser = webdriver.Chrome()
    try:
        browser.get(url)
        page_html = browser.page_source
    finally:
        # Always shut the browser down; otherwise every run leaves a Chrome
        # process behind, even when loading the page raises.
        browser.quit()

    # open() does not create missing directories, so make sure it exists.
    os.makedirs('王者荣耀英雄图片', exist_ok=True)

    for img in BeautifulSoup(page_html, 'lxml').select('.herolist.clearfix>li>a>img'):
        # "https:" is prepended to every src value as-is
        # (presumably they are protocol-relative; verify against the page).
        image_url = 'https:' + img.attrs['src']
        name = img.attrs['alt']
        # Timeout keeps one stalled download from hanging the whole run.
        image_bytes = requests.get(image_url, timeout=10).content
        with open(f'王者荣耀英雄图片/{name}.png', 'wb') as f:
            f.write(image_bytes)

    # NOTE(review): the original paste lost its indentation; this dump of the
    # page source is assumed to run once, after the download loop.
    print(page_html)
# Runs the Honor of Kings downloader as soon as this file is executed or imported.
wzry_hero_pic()