## 01 csv读写操作
1.什么是csv文件:
csv文件叫 逗号分隔值文件,像Excel文件一样以行列的形式保存数据,保存数据的时候同一行的多列数据用逗号隔开。
2.csv文件的读写操作
2.1 csv文件读操作
import csv
from csv import reader, DictReader
2.1.1 reader
创建根据文件对象创建对应的reader,获取文件内容
# --- csv.reader: read each row as a list of strings ---
print('reader----------------------------------------')
# newline='' is required by the csv module so it can handle embedded
# newlines itself.  The original code also built a reader from a bare
# open() that was never closed — that leaked handle is removed; the
# context manager below both opens and reliably closes the file.
with open('files/movie.csv', encoding='utf-8', newline='') as f:
    r1 = reader(f)
    print(r1)            # the reader object itself, not the data
    print(next(r1))      # first row (typically the header)
    print('----------------------------------------')
    print(list(r1))      # all remaining rows; must run while f is open
print('DictReader----------------------------------------')
# --- csv.DictReader: read each row as a dict keyed by the header row ---
with open('files/movie.csv', encoding='utf-8', newline='') as f:
    r2 = DictReader(f)
    print(r2)            # the DictReader object itself
    print(next(r2))      # first data row as a dict (header consumed as keys)
    print('2----------------------------------------')
    # Iterating the reader reads from the underlying file, so the loop
    # must stay inside the `with` block — after it exits the file is
    # closed and iteration would raise ValueError.
    for x in r2:
        print(x)
2.2 csv写操作
from csv import writer, DictWriter
2.2.1 writer
根据文件对象创建writer对象
# --- csv.writer: write rows from lists (or tuples) ---
# newline='' prevents the csv module's own line endings from being
# doubled on Windows; the context manager guarantees the file is closed.
with open('files/student1.csv', 'w', encoding="utf-8", newline='') as f:
    w1 = writer(f)
    # writerow(): one row per call (tuples also accepted)
    w1.writerow(['姓名', '性别', '年龄'])
    w1.writerow(['小明', '男', 22])
    # writerows(): several rows in a single call (tuples also accepted)
    w1.writerows([
        ['小花', '女', '18'],
        ['张三', '男', '30']
    ])
2.2.2 DictWriter
# --- csv.DictWriter: write rows from dicts ---
with open('files/student2.csv', 'w', encoding='utf-8', newline='') as f:
    # The field names (dict keys) must be declared up front.
    w2 = DictWriter(f, ['姓名', '性别', '年龄'])
    w2.writeheader()     # emit the header row built from the field names
    # one row per call
    w2.writerow({'姓名': '小明', '性别': '男', '年龄': 22})
    # several rows in one call
    w2.writerows([
        {'姓名': '小花', '性别': '女', '年龄': 18},
        {'姓名': '张飒', '性别': '男', '年龄': 30}
    ])
## 02 bs4(Beautifulsoup4)
bs4(beautifulsoup4),它是基于css选择器的网页解析器
安装Packages时是装BeautifulSoup4
from bs4 import BeautifulSoup
1.根据网页源代码创建soup对象
# Build the soup object from a local HTML file using the lxml parser.
# A context manager replaces the original open()/close() pair, which
# leaked the file handle whenever read() raised.
with open('files/data.html', encoding='utf-8') as f:
    soup = BeautifulSoup(f.read(), 'lxml')
2.获取标签
soup对象.select(css选择器) - 整个网页中选择器选择的标签,返回值是个list,列表中的元素是标签对象
soup对象.select_one(css选择器) - 获取整个网页中选择器选中的第一个标签,返回值是一个标签,若没有则返None
标签对象.select(css选择器) - 获取指定标签中css选择器选中的所有标签
标签对象.select_one(css选择器) - 获取指定标签中css选择器选中的第一个标签
# select(): every tag matched by the CSS selector '.c1' — always a list
s = soup.select('.c1')
print(s)
print('----------------------------------------')
# select_one(): only the first matching tag, or None when nothing matches
s1 = soup.select_one('.c1')
print(s1)
```
### 3.获取标签内容和标签属性
> 标签.text # 获取标签内容
> 标签.attrs['属性名'] # 获取标签属性
## 03 bs4 of DouBan
```python
import requests
from bs4 import BeautifulSoup
1.获取网页源代码
# Request headers: the cookie value is redacted in these notes; the
# User-Agent makes the request look like a regular desktop Chrome browser
# (Douban rejects requests without one).
headers = {
    'cookie': r'此处省略',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.95 Safari/537.36'
}
# Fetch the Top-250 page and keep the decoded page source.
response = requests.get('https://movie.douban.com/top250', headers=headers)
html = response.text
### 2.解析数据
# Parse the page source; 'lxml' selects the lxml HTML parser backend.
soup = BeautifulSoup(html, 'lxml')
### 3.获取每一个电影的信息
# One card div per movie under the .grid_view list.
div_list = soup.select('.grid_view>li>div')
for card in div_list:
    title = card.select_one('.title ').text
    rating = card.select_one('.rating_num ').text
    # The last span under .star reads like '123456人评价'; dropping the
    # final 3 characters leaves just the vote count.
    votes = card.select('.star>span')[-1].text[:-3]
    print(title, rating, votes)
## 04 bs4+csv作业
import requests
from re import *
from bs4 import BeautifulSoup
from csv import reader, DictReader, writer
def BeiKe_information_collector(website):
    """Scrape one Beike (ke.com) rental-listing page into files/BeiKes.csv.

    website -- full URL of a single listing page (e.g. .../zufang/pg1/).

    Writes one CSV row per listing: description, district chain, floor
    area, layout, price.

    NOTE(review): the CSV is opened in 'w' mode, so every call truncates
    the file and only the last page survives — open with 'a' (writing
    the header once) if pages should accumulate; confirm intent.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.5359.95 Safari/537.36'
    }
    # Fetch and parse first so a failed request cannot truncate the
    # previously written CSV (the original opened the file before the
    # request and never closed it on error).
    response = requests.get(website, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    # `with` guarantees the file is closed even if parsing raises.
    with open('files/BeiKes.csv', 'w', encoding='utf-8', newline='') as f:
        w1 = writer(f)
        w1.writerow(['房屋信息', '区域', '面积', '户型', '价格'])
        # One .content__list--item--main div per listing on the page.
        for item in soup.select('.content__list--item--main'):
            title = item.select_one('.twoline').text
            # Description block: collapse all whitespace so the regexes
            # below can match across the original line breaks.
            desc = sub(r'\s+', '', item.select_one('.content__list--item--des').text)
            # District / business-area / estate links, joined with '-'.
            links = item.select('.content__list--item--des>a')
            address = '-'.join(a.text for a in links)
            # assumes every listing shows "NN.NN㎡" and "x室y厅z卫";
            # raises IndexError otherwise — TODO confirm against site
            area = findall(r'\d+\.\d+㎡', desc)[0]
            house_type = findall(r'\d+室\d+厅\d+卫', desc)[0]
            price = item.select_one('.content__list--item-price').text
            w1.writerow([title.strip(), address, area, house_type, price])
# def Zones_Aver_Price():
# with open('files/BeiKes.csv', encoding='utf-8', newline='') as f:
# r2 = DictReader(f)
# # print(r2)
# print(next(r2)['区域'][0])
if __name__ == '__main__':
    # Scrape the first two result pages (pg0 and pg1).
    for page in range(2):
        target = rf'https://cd.zu.ke.com/zufang/pg{page}/#contentList'
        BeiKe_information_collector(target)
        print('=============一页获取完成================')
    # Zones_Aver_Price()