Day2 爬虫
文章目录
1. 图片下载
import requests
def download_image(url: str, filename: str = '懒洋洋.jpeg'):
    """Download a single image and save it to disk.

    :param url: address of the image resource
    :param filename: output file path; defaults to the original
        hard-coded name so existing callers are unaffected
    """
    # 1. Request the resource
    response = requests.get(url)
    # 2. Grab the raw bytes of the response body
    data = response.content
    # 3. Write the bytes out ('wb' because image data is binary);
    #    'with' guarantees the file handle is closed
    with open(filename, 'wb') as f:
        f.write(data)
if __name__ == '__main__':
    # NOTE(review): this URL is a Bing *search results* page (HTML), not a
    # direct image link — the saved '懒洋洋.jpeg' will actually contain an
    # HTML document. TODO: pass a direct image URL instead.
    download_image('https://cn.bing.com/search?q=%E6%87%92%E6%B4%8B%E6%B4%8B%E5%9B%BE%E7%89%87&form=ANNTH1&refig=7ab020bb977b49d59ac43bb4bd63b09b&sp=2&qs=SC&pq=%E6%87%92%E6%B4%8B%E6%B4%8B&sk=SC1&sc=10-3&cvid=7ab020bb977b49d59ac43bb4bd63b09b')
2. 网站批量下载图片
import requests
from re import findall
from uuid import uuid1 # 可以创建唯一的一个id值
def download_image(url: str):
    """Download one image and save it under a unique uuid1-based name."""
    image_data = requests.get(url).content
    # uuid1() gives each downloaded file a unique name, so files
    # from a batch run never overwrite each other
    with open(f'{uuid1()}.jpeg', 'wb') as image_file:
        image_file.write(image_data)
# 1. Request the rental listing page
response = requests.get('http://cd.zu.ke.com/zufang')
content = response.text
# print(content)
# 2. Parse out every house image address with a regex.
# (?s) makes '.' also match newlines, so the pattern can span lines.
# NOTE(review): the literal '/' immediately before data-src looks
# suspicious — it requires a '/' character right before the attribute
# name; confirm against the page's actual HTML markup.
all_images = findall(r'(?s)<a\s+class="content__list--item--aside".+?>\s+<img.+/data-src="(.+?)"', content)
# 3. Download every matched image (uses the uuid-named download_image above)
for i in all_images:
    download_image(i)
3. 浏览器伪装
import requests
# Disguise the request as a normal browser visit.
# BUG FIX: the header name must be 'User-Agent' (hyphen). The original
# 'User_Agent' (underscore) is not a recognized HTTP header, so the
# disguise silently did nothing (section 5 below already uses the
# correct 'User-Agent' spelling).
Headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47'
}
response = requests.get('https://movie.douban.com/top250', headers=Headers)
print(response.status_code)
print(response.text)
4. bs4数据解析
4.1 bs4的作用
专门用来解析网页数据的第三方库。(基于css选择器解析网页数据)
这个库下载的时候用'beautifulsoup4', 使用的时候用'bs4'
注意:在使用bs4做数据解析的时候,需要依赖'lxml'这个第三方库
导入解析相关类
from bs4 import BeautifulSoup
4.2 bs4的用法
1)准备需要解析的数据(获取网页数据)
html = open('text.html', encoding='utf-8').read()
2)基于网页源代码,创建BeautifulSoup对象
soup对象代表网页对应的HTML标签(代表整个网页)
soup = BeautifulSoup(html, 'lxml')
3)获取标签
soup.select(css选择器) - soup代表的是整个网页,在整个网页中获取css选择器选中的所有标签,返回值是一个列表,列表中的元素是标签对象
soup.select_one(css选择器) - 在整个网页中获取css选择器选中的第一个标签,返回值是标签对象
# select() returns a list of every tag matched by the CSS selector
result = soup.select('#box1 p')
print(result) # [<p>肖生克的救赎</p>, <p>霸王别姬</p>, <p>阿甘正传</p>]
# select_one() returns only the first matched tag (or None)
result = soup.select_one('#box1 p')
print(result) # <p>肖生克的救赎</p>
标签对象.select(css选择器) - 在指定标签中获取css选择器选中的所有标签,返回值是一个列表,列表中元素是标签对象
标签对象.select_one(css选择器) - 在指定标签中获取css选择器选中的第一个标签,返回值是标签对象
# Selecting from soup searches the whole page ...
result = soup.select('p')
print(result) # [<p>肖生克的救赎</p>, <p>霸王别姬</p>, <p>阿甘正传</p>, <p>我是段落1</p>]
# ... while selecting from a tag object only searches inside that tag
box2 = soup.select_one('#box2')
result = box2.select('p')
print(result) # [<p>我是段落1</p>]
4)获取标签内容和标签属性
p = soup.select_one('p') # <p>肖生克的救赎</p>
img = soup.select_one('img')
a. 获取标签内容: 标签对象.text
# .text gives the tag's text content without the surrounding markup
print(p.text) # '肖生克的救赎'
b.获取标签属性值: 标签对象.attrs[属性名]
# .attrs is a dict of the tag's attributes, keyed by attribute name
print(img.attrs['src'], img.attrs['alt'])
5. bs4解析某瓣电影单页数据
import requests
from bs4 import BeautifulSoup
import csv
# 1.请求网页
def get_net_data(url: str):
    """Request *url* with a browser User-Agent and return its HTML text."""
    # Pretend to be a desktop browser so the site doesn't reject us
    browser_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.5112.81 Safari/537.36 Edg/104.0.1293.47'
    }
    resp = requests.get(url, headers=browser_headers)
    print(resp.status_code)
    return resp.text
# 2.网页数据解析
def analysis_data(html: str):
    """Parse one douban top250 page and save the film data to a CSV file.

    :param html: page source returned by get_net_data
    """
    soup = BeautifulSoup(html, 'lxml')
    # BUG FIX: the original selector 'grid_view>li>.item' was missing the
    # leading '.' (class selector) and matched nothing — section 6 below
    # already uses the working '.grid_view>li>.item'.
    all_film_div = soup.select('.grid_view>li>.item')
    all_data = []
    for div in all_film_div:
        name = div.select_one('.title').text
        # last line of the .bd>p block looks like 'year / country / genre'
        info = div.select_one('.bd>p').text.strip().split('\n')[-1].strip()
        # Index from both ends instead of a 3-way unpack: some titles
        # contain extra '/' separators (same fix as section 6)
        info_list = info.split('/')
        time = info_list[0]
        country = info_list[-2]
        category = info_list[-1]
        score = div.select_one('.rating_num').text
        # last .star span reads like '123456人评价' — drop the 3-char suffix
        comment_count = div.select('.star>span')[-1].text[:-3]
        # some films have no one-line intro (.inq) — default to ''
        intro_tag = div.select_one('.inq')
        intro = intro_tag.text if intro_tag else ''
        all_data.append([name, score, time.strip(), country.strip(), category.strip(), comment_count, intro])
    # Save the data; 'with' closes the file (the original leaked the handle)
    with open('第一页电影数据.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['电影名', '评分', '上映时间', '国家', '类型', '评论数', '简介'])
        writer.writerows(all_data)
if __name__ == '__main__':
    # Fetch page 1 of douban top250, then parse and save it
    result = get_net_data('https://movie.douban.com/top250')
    analysis_data(result)
6. bs4解析某瓣电影十页数据
import requests
from bs4 import BeautifulSoup
import csv
# 1. 获取网页数据
def get_net_data(url: str):
    """Return the HTML text of *url*, requested with a browser User-Agent."""
    # A browser User-Agent keeps the site from rejecting the request
    ua_headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36'
    }
    return requests.get(url, headers=ua_headers).text
# 2. 网页数据解析
def analysis_data(html: str):
    """Parse one douban top250 page and append its rows to the CSV.

    Relies on the module-level `writer` created in the __main__ block.

    :param html: page source returned by get_net_data
    """
    soup = BeautifulSoup(html, 'lxml')
    rows = []
    for item in soup.select('.grid_view>li>.item'):
        name = item.select_one('.title').text
        # last line of .bd>p looks like 'year / country / genre'
        detail = item.select_one('.bd>p').text.strip().split('\n')[-1].strip()
        parts = detail.split('/')
        time = parts[0]        # index from both ends instead of unpacking:
        country = parts[-2]    # some titles contain extra '/' separators
        category = parts[-1]
        score = item.select_one('.rating_num').text
        # last .star span reads like '123456人评价' — drop the 3-char suffix
        comment_count = item.select('.star>span')[-1].text[:-3]
        # some films have no one-line intro (.inq) — default to ''
        intro_tag = item.select_one('.inq')
        intro = intro_tag.text if intro_tag else ''
        rows.append([name, score, time.strip(), country.strip(), category.strip(), comment_count, intro])
    writer.writerows(rows)
    print('保存成功!')
if __name__ == '__main__':
    # Create the CSV once; analysis_data appends via this module-level
    # writer. 'with' flushes/closes the file (the original leaked it).
    with open('豆瓣电影.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['电影名', '评分', '上映时间', '国家', '类型', '评论数', '简介'])
        # 10 pages of 25 films each: start = 0, 25, ..., 225.
        # BUG FIX: range(0, 251, 25) also yielded start=250, requesting an
        # 11th, empty page beyond the Top250 list.
        for page in range(0, 250, 25):
            url = f'https://movie.douban.com/top250?start={page}&filter='
            result = get_net_data(url)
            analysis_data(result)