day3 csv和Bs4
01 获取网页数据
import csv
import requests
from re import findall
def get_one_page(start=0):
    """Scrape one page (25 movies) of the Douban Top-250 list and append
    the rows through the module-level csv writer ``w2``.

    :param start: offset of the first movie on the page (0, 25, 50, ...)

    NOTE(review): depends on a global ``w2`` created in the ``__main__``
    block — this function must be called only after that writer exists.
    """
    url = f'https://movie.douban.com/top250?start={start}&filter='
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    html = response.text
    # Movie titles are the alt text of the 100px-wide poster images.
    names = findall(r'<img width="100" alt="(.+?)"', html)
    # (?s) lets "." span newlines; each <p class=""> block holds staff + meta.
    info = findall(r'(?s)<p class="">(.+?)</p>', html)
    # The last line of each block looks like "year / country / genres".
    info = [x.strip().split('\n')[-1].strip() for x in info]
    times = []
    countries = []
    types = []
    for x in info:
        result = x.split(' / ')
        times.append(result[0])
        countries.append(result[1])
        types.append(result[2])
    score = findall(r'<span class="rating_num" property="v:average">(.+?)</span>', html)
    comment = findall(r'<span>(\d+)人评价</span>', html)
    # zip() already yields the 6-tuples; the identity map/lambda was redundant.
    w2.writerows(zip(names, score, comment, times, countries, types))
    print('-------------------------------一页获取完成-----------------------')
def get_one_page2(start=0):
    """Scrape one Top-250 page and print (name, info-block, score,
    comment-count) tuples extracted with a single combined regex.

    :param start: offset of the first movie on the page (default 0
        reproduces the originally hard-coded first-page URL)
    """
    url = f'https://movie.douban.com/top250?start={start}&filter='
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    html = response.text
    # One pass: title, raw info <p>, rating and vote count per movie.
    result = findall(r'(?s)<img width="100" alt="(.+?)".+?<p class="">(.+?)</p>.+?<span class="rating_num" property="v:average">(.+?)</span>.+?<span>(\d+)人评价</span>', html)
    print(result)
from datetime import datetime
from csv import writer
if __name__ == '__main__':
    # newline='' stops the csv module from emitting blank lines on Windows;
    # the with-statement guarantees the file is flushed and closed even if
    # a page request raises mid-run (the original open/close leaked on error).
    with open('files/top250.csv', 'w', encoding='utf-8', newline='') as f:
        w2 = writer(f)  # module-global: get_one_page() writes through this
        w2.writerow(['电影名称', '评分', '评论人数', '上映时间', '国家', '类型'])
        # 10 pages of 25 movies each: start = 0, 25, ..., 225.
        for x in range(0, 226, 25):
            get_one_page(x)
02 csv文件读和写
from csv import reader, DictReader

# Plain reader: each row comes back as a list of strings.
with open('files/电影.csv', encoding='utf-8', newline='') as f:
    row_iter = reader(f)
    print(next(row_iter))   # first next() consumes the header row
    print(list(row_iter))   # remaining rows, materialized at once

print('--------------------------------华丽的分割线-------------------------------------')

# DictReader: the header row becomes the keys of every record.
with open('files/电影.csv', encoding='utf-8', newline='') as f:
    record_iter = DictReader(f)
    print(next(record_iter))
    for record in record_iter:
        print(record)
from csv import writer, DictWriter

# writer: rows are plain sequences (lists or tuples both work).
with open('files/student1.csv', 'w', encoding='utf-8', newline='') as f:
    list_writer = writer(f)
    list_writer.writerow(['姓名', '性别', '年龄'])
    list_writer.writerow(['小明', '男', 22])
    more_rows = [
        ('小花', '女', 18),
        ['张三', '男', 30],
    ]
    list_writer.writerows(more_rows)

# DictWriter: rows are dicts; the field-name list fixes the column order.
with open('files/student2.csv', 'w', encoding='utf-8', newline='') as f:
    dict_writer = DictWriter(f, ['姓名', '性别', '年龄'])
    dict_writer.writeheader()  # emits the field names as the first row
    dict_writer.writerow({'姓名': '小明', '性别': '男', '年龄': 22})
    dict_writer.writerows([
        {'姓名': '小花', '性别': '女', '年龄': 17},
        {'姓名': '小红', '性别': '女', '年龄': 18},
        {'姓名': '张三', '性别': '男', '年龄': 30},
    ])
03 bs4
"""
css语法:
选择器{属性名1: 属性值2; 属性名2: 属性值2; ....}
常见属性:color(设置字体颜色)、 background-color(背景颜色)、font-size(字体大小)、width(宽度)、height(高度)、border(边框)
选择器:
1. 元素选择器(标签选择器) - 将标签作为选择器,选中所有指定的标签
a{} - 选中所有的a标签
p{} - 选中所有的p标签
span{} - 选中所有的span标签
2. id选择器 - 在标签的id属性前加#作为一个选择器,选中id属性值为指定值的标签
注意:一个网页中id属性值是唯一的
#a{} - 选中id属性值为a的标签
#b1{} - 选中id属性值为b1的标签
3. class选择器 - 在标签的class属性前加.作为一个选择器,选中所有class属性值为指定值的标签
注意:一个网页中多个标签的class属性值可以相同;同一个标签可以有多个不同的class
只有一个class属性值标签的写法:<标签名 class="c1">
有多个class属性值标签的写法:<标签名 class="c1 c2 c3">
.a{} - 选中class属性值为a标签
.c1{} - 选中class属性值为c1的标签
.a.b{} - 选中class属性值同时为a和b标签
a.c1{} - 选中所有class值为c1的a标签
4. 子代选择器 - 将两个选择器用>连接成一个选择器(前后形成父子关系)
div>a{} - 选中所有在div标签中的a标签(a标签必须是div的子标签)
5. 后代选择器 - 将两个选择器用空格连接成一个选择器(前后形成后代关系)
div a{} - 选中所有在div标签中的a标签(a标签必须是div的后代标签)
"""
from bs4 import BeautifulSoup

# Parse the sample page; the with-statement closes the file as soon as the
# text has been read (original opened/closed manually, leaking on error).
with open('files/data.html', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'lxml')

# select() returns a list of ALL matches; select_one() the first (or None).
result = soup.select('p')          # every <p> in the document
print(result)
box1 = soup.select_one('#box1')    # the element with id="box1"
result = box1.select('p')          # only <p> tags inside that element
print(result)
p1 = soup.select_one('span>p')     # first <p> that is a direct child of a <span>
a1 = box1.select_one('a')
print(p1.text)                     # .text: concatenated text content of the tag
print(a1.text)
print(a1.attrs['href'])            # tag attributes behave like a dict
04 bs4爬取豆瓣网
import requests
from bs4 import BeautifulSoup

headers = {
    'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36'
}
response = requests.get('https://movie.douban.com/top250', headers=headers)
html = response.text
soup = BeautifulSoup(html, 'lxml')

# One <div> per movie inside the ranked <ol class="grid_view"> list.
for movie_div in soup.select('.grid_view>li>div'):
    title = movie_div.select_one('.title').text
    rating = movie_div.select_one('.rating_num').text
    # The last <span> under .star reads like "123456人评价"; strip the
    # 3-character "人评价" suffix to keep just the number.
    vote_count = movie_div.select('.star>span')[-1].text[:-3]
    print(title, rating, vote_count)