1.发送请求,获取响应
# Target page: Douban Top 250 movies
url = 'https://movie.douban.com/top250'

# Request headers: make the request look like it comes from a real browser session
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
    'cookie': 'xxxxxx',
}
User-Agent:让请求在网站面前伪装成由浏览器发出。例如爬取豆瓣时需要设置。
cookie:登录之后由网站生成,用来告诉网站"我是已登录的用户"。例如爬取知乎时需要携带。
# Send the GET request and report the outcome (HTTP 200 means success).
response = requests.get(url, headers=headers)
if response.status_code != 200:
    print('请求失败!', response)
else:
    print(response.text)
2.正则表达式匹配电影名称
贪婪匹配:是指在整个正则表达式匹配成功的前提下,尽可能多的匹配。正则表达式默认的匹配方式是贪婪匹配,即尽可能多的匹配。
懒惰匹配/惰性匹配:是指在整个表达式匹配成功的前提下,尽可能少的匹配。
# Scrape movie titles and ratings from Douban Top 250.
import re
import time
import random
import bs4
import requests

# Regex alternative for extracting titles (kept as the section-2 demo;
# the BeautifulSoup selectors below are what is actually used).
pattern = re.compile(r'<span class="title">(.+?)</span>')

# Open the output file ONCE, before the loop. Opening it with mode 'w'
# inside the loop (as the original did) truncates it on every page, so
# only the last page's data would survive if the page range were raised.
with open('movie_data.txt', 'w', encoding='utf-8-sig') as file:
    for page in range(1):  # one page; raise the bound to fetch more pages
        resp = requests.get(
            url=f'https://movie.douban.com/top250?start={page * 25}',
            headers={
                # A real browser User-Agent. The original had a proxy-API URL
                # pasted here by mistake, which the site would reject.
                'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36',
            }
        )
        print(resp.status_code)
        if resp.status_code == 200:
            soup = bs4.BeautifulSoup(resp.text, 'html.parser')
            # span:nth-child(1) --- the first <span> child of the <a> tag
            title_span_list = soup.select('div.info>div.hd>a>span:nth-child(1)')
            rating_span_list = soup.select('div.info>div.bd>div>span.rating_num')
            # One "title:rating" line per movie
            for title_span, rating_span in zip(title_span_list, rating_span_list):
                file.write(f'{title_span.text}:{rating_span.text}\n')
        # Random pause between pages so we don't hammer the server
        time.sleep(random.randint(3, 5))
3. 用CSV保存文件
CSV - Comma-Separated Values(逗号分隔值)
# Save the same data as CSV.
import csv

# newline='' stops the csv module from writing blank lines on Windows;
# utf-8-sig keeps the Chinese titles readable when the file is opened in Excel.
with open('movie_data.csv', 'w', newline='', encoding='utf-8-sig') as file:
    # csv.writer accepts a `delimiter` argument, e.g.
    # csv.writer(file, delimiter='#') would separate fields with '#'
    # instead of the default ','.
    writer = csv.writer(file)
    # Header row
    writer.writerow(['标题', '评分'])
    # NOTE(review): title_span_list / rating_span_list come from the
    # scraping snippet above; this block assumes it has already run.
    for title_span, rating_span in zip(title_span_list, rating_span_list):
        writer.writerow([title_span.text, rating_span.text])