import requests
from bs4 import BeautifulSoup
from re import*import csv
# Shared request headers: present the scraper as a desktop Chrome browser.
headers = {
    'User-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/108.0.0.0 Safari/537.36'
}


def get_content(he):
    """Scrape one Douban Top250 list page.

    :param he: URL of one list page (25 movies per page).
    :return: list of rows, one per movie:
             [title, *other-names, *year/country/genre parts, score,
              comment count, slogan]
    """
    html = requests.get(he, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')
    all_movies = []
    for movie in soup.select('ol.grid_view>li'):
        movies = []
        # Each '.title' span may hold several names separated by '/';
        # keep the last segment of each, concatenated.
        title = ''.join(t.text.split('/')[-1].strip()
                        for t in movie.select('.title'))
        movies.append(title)
        # Alternative titles, '/'-separated in the '.other' span.
        for o in movie.select('.other')[0].text.split('/'):
            movies.append(o.strip())
        # The second-to-last info line looks like ' 1994 / 美国 / 犯罪 剧情';
        # the regex grabs everything from the year onward.
        category = movie.select('.bd>p')[0].text
        re_category = findall(r'\s+(.\d{4}.+)', category.split('\n')[-2])[0]
        movies.extend(x.strip() for x in re_category.split('/'))
        movies.append(movie.select('.star>span')[1].text)  # score
        comment = movie.select('.star>span')[3].text
        movies.append(comment[:-3])  # drop trailing '人评价'
        # Fix: a few Top250 entries carry no quote; the original
        # unconditional [0] indexing raised IndexError there.
        quote = movie.select('.quote>span')
        movies.append(quote[0].text if quote else '')
        all_movies.append(movies)
    return all_movies
def download_content(movie):
    """Append one page of scraped rows to files/电影.csv.

    :param movie: list of rows as produced by get_content().
    """
    import os
    # Fix: the original assumed 'files/' already existed.
    os.makedirs('files', exist_ok=True)
    # Fix: the original never closed the file handle.
    with open('files/电影.csv', 'a', encoding='utf-8', newline='') as f:
        w = csv.writer(f)
        # Fix: the original wrote the header row on every call, so the CSV
        # repeated it once per page.  Write it only when the file is empty
        # (in append mode tell() reports the current file size).
        if f.tell() == 0:
            w.writerow(['片名', '其它', '类别', '评分', '评论人数', '标语'])
        w.writerows(movie)


if __name__ == '__main__':
    page = int(input('请输入需爬取的页数(1~10): '))
    for i in range(1, page + 1):
        # Fix: page i starts at offset (i-1)*25; the original used i*25,
        # skipping the first page and requesting one page past the end.
        href = 'https://movie.douban.com/top250?start=' + str((i - 1) * 25) + '&filter='
        me = get_content(href)
        download_content(me)
2
import requests
from bs4 import BeautifulSoup
import csv
from re import findall
def get_one_page(page):
    """Scrape one Douban Top250 page and write its rows via the global writer.

    :param page: start offset of the page (0, 25, 50, ...).
    """
    # 1. Fetch the page HTML.
    url = f'https://movie.douban.com/top250?start={page}&filter='
    headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'}
    response = requests.get(url, headers=headers)
    html = response.text
    # 2. Parse the 25 movie cards on this page.
    soup = BeautifulSoup(html, 'lxml')
    for div in soup.select('.item'):
        name = div.select_one('.title').text
        score = float(div.select_one('.rating_num').text)
        # Fourth star span reads like '123456人评价'; strip the 3-char suffix.
        comment_num = int(div.select_one('.star>span:nth-child(4)').text[:-3])
        # Some entries have no one-line description.
        describe_tag = div.select_one('.inq')
        describe = describe_tag.text if describe_tag else ''
        # Last info line: '1994 / 美国 / 犯罪 剧情'.
        message = div.select_one('.bd>p').text
        info = message.strip().split('\n')[-1].strip()
        result = [x.strip() for x in info.split('/')]
        show_time = result[0]      # release year
        country = result[1]
        film_type = result[-1]
        w.writerow([name, score, comment_num, show_time, country, film_type, describe])
    print('写入成功!')


if __name__ == '__main__':
    # Fix: the original opened the CSV without ever closing it; the
    # with-block guarantees the data is flushed and the handle released.
    with open('files/电影.csv', 'w', encoding='utf-8', newline='') as _f:
        w = csv.writer(_f)
        w.writerow(['电影名称', '评分', '评论人数', '上映时间', '国家', '类型', '描述'])
        for start in range(0, 226, 25):
            get_one_page(start)
3
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By
import time
import openpyxl
# 1. Prepare the Excel workbook that will hold the scraped data.
wb = openpyxl.Workbook()
sheet = wb.create_sheet('豆瓣电影Top250')
sheet.cell(1, 1).value = '电影名称'
sheet.cell(1, 2).value = '评分'
row = 2  # next worksheet row to fill (row 1 is the header)

# 2. Drive a real browser through the 10 list pages.
b = Chrome()
b.get('https://movie.douban.com/top250')
time.sleep(1)
for page in range(10):
    # All movie names and scores on the current page.
    all_title = b.find_elements(By.CSS_SELECTOR, 'div.hd>a>span:nth-child(1)')
    all_score_tag = b.find_elements(By.CLASS_NAME, 'rating_num')
    # Write this page's (name, score) pairs into the sheet.
    for title_tag, score_tag in zip(all_title, all_score_tag):
        sheet.cell(row, 1).value = title_tag.text
        sheet.cell(row, 2).value = score_tag.text
        row += 1
    # Fix: only click '后页' between pages -- on the last page the control
    # is no longer a working link, so the original's final click was useless
    # (and fragile).
    if page < 9:
        b.find_element(By.CLASS_NAME, 'next').click()
        time.sleep(2)

# Fix: the original never quit the browser, leaking the driver process.
b.quit()
wb.save('files/电影.xlsx')
英雄联盟皮肤图片下载 (League of Legends skin image downloader)
import requests
import os
def get_all_hero_id():
    """Return the heroId of every champion from the official hero list."""
    response = requests.get('https://game.gtimg.cn/images/lol/act/img/js/heroList/hero_list.js')
    result = response.json()
    return [x['heroId'] for x in result['hero']]


def download(img_url, path):
    """Download one image and save its bytes to *path*."""
    response = requests.get(img_url)
    result = response.content
    with open(path, 'wb') as f:
        f.write(result)
    print(f'{path}:下载完成!')


def get_one_hero_skin(hero_id):
    """Download every skin splash of one champion into files/<hero name>/.

    :param hero_id: numeric hero id as returned by get_all_hero_id().
    """
    url = f'https://game.gtimg.cn/images/lol/act/img/js/hero/{hero_id}.js'
    response = requests.get(url)
    result = response.json()
    for x in result['skins']:
        hero_name = x['heroName']
        # '/' is illegal inside a file name; drop it from the skin name.
        skin_name = x['name'].replace('/', '')
        # Chroma-only entries have an empty mainImg; fall back to chromaImg.
        img_url = x['mainImg'] or x['chromaImg']
        # Fix: makedirs also creates the parent 'files/' directory and
        # tolerates pre-existing ones, where the original exists()+mkdir
        # pair crashed whenever 'files/' was missing.
        os.makedirs(f'files/{hero_name}', exist_ok=True)
        download(img_url, f'files/{hero_name}/{skin_name}.png')


if __name__ == '__main__':
    for hero_id in get_all_hero_id():
        get_one_hero_skin(hero_id)