4.2学习笔记
一、实例——房天下
import requests
from lxml import etree
from pyquery import PyQuery
import csv
# 字符串.strip() - 将字符串前面和后面的空白去掉
def an_data(data: str):
    """Parse one fang.com new-house listing page and append its rows to a CSV file.

    Args:
        data: HTML text of a listing page.

    Returns:
        True when the page contains no listing items (the caller uses this as
        its stop signal); otherwise None, after the rows have been appended to
        'files/成都新房.csv'.
    """
    # The same <li> nodes could also be selected with lxml/XPath:
    #   etree.HTML(data).xpath('//div[@id="newhouse_loupai_list"]/ul/li')
    html = PyQuery(data)
    lis = html('#newhouse_loupai_list>ul>li')
    house_list = []
    for x in lis:
        house = {}
        p_x = PyQuery(x)

        # Project name and detail-page link
        name_a = p_x('.nlcd_name>a')
        url = name_a.attr('href')
        # Only protocol-relative links ('//host/path') need the scheme added;
        # a missing or already-absolute href is stored unchanged.
        new_url = 'https:' + url if url and url.startswith('//') else url
        house['楼盘名'] = name_a.text()
        # Store the absolute link (the original computed it but saved the raw href).
        house['连接'] = new_url

        # Price
        house['售价'] = p_x('.nhouse_price>span').text()

        # Layout / floor area
        house['户型/面积'] = p_x('.house_type.clearfix>a').text()

        # Address
        house['地址'] = p_x('.address').text()

        # Sales status and feature tags
        box = p_x('.fangyuan')
        house['销售情况'] = box('span').text()
        house['特点'] = box('a').text()

        print('==========================================')
        house_list.append(house)

    # No items on this page: tell the caller there is nothing left to fetch.
    if not house_list:
        return True

    with open('files/成都新房.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, list(house_list[0].keys()))
        # No header row is written: the file is opened in append mode once per
        # page, so writeheader() here would repeat the header for every page.
        writer.writerows(house_list)
def get_data(index):
    """Download listing page *index* and pass its HTML to an_data().

    Args:
        index: Page number appended to the 'b9' segment of the listing URL.

    Returns:
        Whatever an_data() returns for the page when the status code is 200;
        None (after printing a message) for any other status code.
    """
    url = 'https://cd.newhouse.fang.com/house/s/b9'+str(index)
    # requests has no default timeout; without one a stalled connection would
    # block the crawl indefinitely.
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        print('获取失败!')
        # NOTE(review): a falsy result makes a caller that loops "until truthy"
        # move on to the next page - confirm that is wanted for persistent failures.
        return None
    # Avoid mojibake: use the same charset the page declares in its <meta> tag.
    response.encoding = 'gb2312'
    return an_data(response.text)
def get_all_data():
    """Fetch pages 1, 2, 3, ... until get_data() returns a truthy value.

    After each page that does not end the crawl, the number of the next page
    is printed.
    """
    page = 1
    while not get_data(page):
        page += 1
        print(page)
# Run the crawl when this script section is executed.
get_all_data()
二、excel文件的读操作
import openpyxl
# 1. Load the Excel file
work_book = openpyxl.load_workbook('files/example.xlsx')
# 2. Get sheet-related data
# 1) Get the sheet names
names = work_book.sheetnames
print(names) # ['排行榜', '学生信息表', '学生信息表1']
# 2) Get sheet objects
# a. Look up a worksheet by its name
sheet1 = work_book['排行榜']
print(sheet1) # <Worksheet "排行榜">
# b. Get the active sheet
sheet2 = work_book.active
print(sheet2) # <Worksheet "排行榜">
# c. Get all sheets
all_sheet = work_book.worksheets
print(all_sheet) # [<Worksheet "排行榜">, <Worksheet "学生信息表">, <Worksheet "学生信息表1">]
sheet3 = all_sheet[-1]
print(sheet3) # <Worksheet "学生信息表1">
# 3. Read information from a sheet
# a. Get the sheet title
print(sheet1.title) # '排行榜'
# b. Get the maximum row count and maximum column count
# sheet.max_row
# sheet.max_column
max_row = sheet1.max_row
max_column = sheet1.max_column
print(max_row, max_column)
# 4. Get cells
# a. Get a single cell
# sheet['coordinate'] - coordinate format: 'column letter(s) followed by row number'
cell1 = sheet1['A1']
print(cell1) # <Cell '排行榜'.A1>
# sheet.cell(row, column) - row and column are both numbers starting at 1
cell2 = sheet1.cell(3, 2)
print(cell2)
# b. Get a group of cells
# Row by row
cells1 = sheet1.iter_rows() # every cell in the sheet, one row at a time
print(list(cells1))
cells2 = sheet1.iter_rows(min_row=4, max_row=9, min_col=1, max_col=2)
print(list(cells2))
# Column by column
cells3 = sheet1.iter_cols()
# Left commented out - presumably so that cells3 is not consumed before the
# loop at the end of this section iterates it.
# print(list(cells3))
scores_cell = sheet1.iter_cols(min_row=2, min_col=3)
print(list(scores_cell))
# Slicing
# sheet[top-left coordinate:bottom-right coordinate]
cells4 = sheet1['b3':'C9']
print(cells4)
# 5. Get information from a cell
# 1) The data stored in the cell
print(cell1.value)
# 2) Position information
print(cell1.row) # 1
print(cell1.column) # 1
print(cell1.coordinate) # 'A1'
# Walk the sheet one column at a time and print the value of every cell.
for column_cells in cells3:
    for cell in column_cells:
        print(cell.value)
三、excel文件的写操作
from pyecharts.charts import Bar, Line,Kline
from pyecharts.options import TitleOpts,LegendOpts,ToolboxOpts,DataZoomOpts, LabelOpts, \
MarkPointOpts, MarkPointItem, MarkLineOpts,MarkLineItem, AxisOpts,SplitAreaOpts, AreaStyleOpts
import openpyxl
from datetime import datetime
def get_data():
    """Load the Tencent 2017 stock sheet and split it into K-line chart inputs.

    Reads the first five columns of the active sheet of
    'files/腾讯2017年股票数据.xlsx', starting at row 2 (row 1 is skipped,
    presumably because it holds the column titles).

    Returns:
        A ``(dates, prices)`` tuple. ``dates`` is a list of 'year-month-day'
        strings (no zero padding); ``prices`` is a list whose items are lists
        of the remaining four cell values of each row, in sheet order.
    """
    wb = openpyxl.load_workbook('files/腾讯2017年股票数据.xlsx')
    sheet = wb.active
    dates = []
    prices = []
    # values_only=True yields tuples of plain cell values instead of Cell objects,
    # so no separate pass is needed to unwrap them.
    for day, *quote in sheet.iter_rows(min_row=2, max_col=5, values_only=True):
        if day is None:
            # A row without a date has nothing to plot and would otherwise
            # fail on the attribute access below.
            continue
        dates.append(f'{day.year}-{day.month}-{day.day}')
        prices.append(quote)
    return dates, prices
# print(get_data())
# datetime.datetime(2017, 1, 3, 0, 0)
def create_kline():
    """Draw the data returned by get_data() as a K-line chart in 'files/k线图.html'."""
    date, price = get_data()

    # Build the option objects first so the chart calls below stay short.
    split_area = SplitAreaOpts(
        is_show=True,
        areastyle_opts=AreaStyleOpts(opacity=1),
    )
    y_axis = AxisOpts(is_scale=True, splitarea_opts=split_area)
    zoom = DataZoomOpts(is_show=True)

    chart = Kline()
    # x axis: one label per row, e.g. ['2001', '2002', ...]
    chart.add_xaxis(date)
    # y axis: one list of four values per x label
    chart.add_yaxis('', price)
    chart.set_global_opts(datazoom_opts=zoom, yaxis_opts=y_axis)
    chart.render('files/k线图.html')
# Render the K-line chart when this script section is executed.
create_kline()
# 2.折线图
def create_line():
    """Draw a two-series line chart of sample city figures in 'files/line.html'."""
    cities = ['成都', '重庆', '北京', '武汉', '深圳', '上海', '昆明']
    confirmed = [120, 98, 456, 2837, 897, 1020, 34]
    deaths = [90, 50, 102, 340, 201, 290, 5]
    # Mark line of type 'average', attached to the first series only.
    average_line = MarkLineOpts(
        data=[MarkLineItem(type_='average', name='平均值')]
    )

    chart = Line()
    chart.add_xaxis(cities)
    chart.add_yaxis('确诊人数', confirmed, markline_opts=average_line)
    chart.add_yaxis('死亡人数', deaths)
    chart.render('files/line.html')
# create_line()
# 1.柱状图
def create_bar():
    """Draw a two-series bar chart of sample city figures in 'files/bar.html'."""
    # 1. Create the chart object
    bar = Bar()
    # 2. Add the data
    # Labels shown along the x axis
    bar.add_xaxis(['成都', '重庆', '北京', '武汉', '深圳', '上海', '昆明'])
    # Series name fixed from '确证人数' to '确诊人数' so it matches the chart subtitle.
    bar.add_yaxis('确诊人数', [120, 98, 456, 2837, 897, 1020, 34])
    bar.add_yaxis('死亡人数', [90, 50, 102, 340, 201, 290, 5])
    # 3. Options (optional)
    # Global options
    bar.set_global_opts(
        # Title and subtitle
        title_opts=TitleOpts(title='全国疫情信息表', subtitle='确诊人数和死亡人数', pos_left=50),
        # Hide the legend
        legend_opts=LegendOpts(is_show=False),
        # Show the toolbox
        toolbox_opts=ToolboxOpts(is_show=True),
        # Show the zoom control
        datazoom_opts=DataZoomOpts(is_show=True),
    )
    # Series options
    bar.set_series_opts(
        label_opts=LabelOpts(
            # Show the value labels
            is_show=True,
            position='left',
        ),
        # Max/min mark points can be enabled by also passing:
        # markpoint_opts=MarkPointOpts([
        #     MarkPointItem(type_='max', name='最大值'),
        #     MarkPointItem(type_='min', name='最小值')
        # ])
    )
    # 4. Render
    bar.render('files/bar.html')
# create_bar()
四、饼状图
from pyecharts.charts import Pie
from pyecharts import options
# 1. Sample data: (category name, value) pairs
cate_data = [('苹果', 560), ('华为', 789), ('小米', 623), ('oppo', 200)]

# 2. Series label format: {b} -> item name, {d} -> percentage
label_opts = options.LabelOpts(formatter='{b}-{d}%')

# 3. Create the chart and attach the data
pie = Pie()
pie.add(
    '月销量',
    cate_data,
    # Ring radii: [radius of the empty centre, radius of the whole circle]
    radius=['50', '150'],
    # Give the slices different radii
    rosetype='radius',
)

# 4. Apply the series options
pie.set_series_opts(label_opts=label_opts)

# 5. Write the chart to disk
pie.render('files/饼状图.html')
五、实例——地图
from pyecharts.charts import Map
from pyecharts import options
# 1. Sample data: (region name, value) pairs
cate_data = [
    ('重庆', 579), ('黑龙江', 925), ('北京', 593),
    ('山西', 197), ('浙江', 1268), ('新疆', 290),
]

# 2. Option objects used by the chart
title_opts = options.TitleOpts(title='全国疫情信息')
# Colour blocks, split into pieces over the range 0..2000
visualmap_opts = options.VisualMapOpts(min_=0, max_=2000, is_piecewise=True)
# Hide the legend at the top
legend_opts = options.LegendOpts(is_show=False)

# 3. Create the map chart and attach the data
# NOTE(review): the name `map` shadows the builtin map() for the rest of this script.
map = Map()
map.add('确诊人数', cate_data, maptype='china', zoom=0.7)

# 4. Apply the global options
map.set_global_opts(
    title_opts=title_opts,
    visualmap_opts=visualmap_opts,
    legend_opts=legend_opts,
)

# 5. Write the chart to disk
map.render('files/地图.html')
六、实例——企查查
import requests
from bs4 import BeautifulSoup
import os
import openpyxl
def an_data(data: str):
    """Extract company records from the HTML of a qcc.com search-result page.

    Args:
        data: HTML text of the result page.

    Returns:
        A list of dicts with the keys 'url', 'name', 'ceo', 'tel' and 'money'
        (in that order), one per '.ntable>tr' row that carries a title link
        with a <span> and at least four '.val' cells. Rows lacking any of
        these are skipped.
    """
    soup = BeautifulSoup(data, 'lxml')
    all_data = []
    for company in soup.select('.ntable>tr'):
        titles = company.select('.title')
        # Select the value cells once per row instead of once per field.
        vals = company.select('.val')
        if not titles or len(vals) < 4:
            # Not a complete company row; indexing would raise IndexError.
            continue
        title_a = titles[0]
        spans = title_a.select('span')
        if not spans:
            continue
        all_data.append({
            # Company link (None when the title element has no href)
            'url': title_a.attrs.get('href'),
            # Company name
            'name': spans[0].get_text(),
            # Legal representative
            'ceo': vals[0].get_text(),
            # Phone number
            'tel': vals[3].get_text(),
            # Registered capital
            'money': vals[1].get_text(),
        })
    return all_data
def get_data(index):
    """Download page *index* of the qcc.com search results for the keyword '科技'.

    Args:
        index: Value of the 'p' query parameter.

    Returns:
        The response body decoded as UTF-8 when the status code is 200;
        None (after printing a message) for any other status code.
    """
    url = 'https://www.qcc.com/web/search?key=科技&p='+str(index)
    # NOTE(review): this cookie is a hard-coded session captured from a browser.
    # It will expire and should not live in source control - load it from
    # configuration or the environment instead.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
        'cookie': 'QCCSESSID=qcil9523aq26u5igmmo9mcm3r7; zg_did=%7B%22did%22%3A%20%22178905f87192d4-0a9af7c29ac9bb-1633685b-fa000-178905f871a49c%22%7D; UM_distinctid=178905f8dea1df-03d3261599966d-1633685b-fa000-178905f8deb349; hasShow=1; _uab_collina=161732988514123890189604; acw_tc=7d40019f16173433620472409ead0a0f89abba0b90cfd8e6695135426c; CNZZDATA1254842228=1019155693-1617329158-%7C1617339958; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201617343363693%2C%22updated%22%3A%201617343471568%2C%22info%22%3A%201617329882920%2C%22superProperty%22%3A%20%22%7B%5C%22%E5%BA%94%E7%94%A8%E5%90%8D%E7%A7%B0%5C%22%3A%20%5C%22%E4%BC%81%E6%9F%A5%E6%9F%A5%E7%BD%91%E7%AB%99%5C%22%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%2213c101d390dcf33467df46d59cfb4a54%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%7D'
    }
    # requests has no default timeout; without one a stalled connection would
    # block the script indefinitely.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code != 200:
        print('请求失败')
        return None
    response.encoding = 'utf-8'
    return response.text
def save_data(data: list):
    """Append company records to the active sheet of 'files/企查查.xlsx'.

    The workbook is created (with its sheet titled '科技') when the file does
    not exist yet. While the sheet reports a single row, the keys of the first
    record are written to row 1 as column titles. Records are then written
    below the rows already present, one record per row, values in dict order.

    Args:
        data: List of dicts that all share the same key order. An empty list
            is a no-op.
    """
    if not data:
        # Nothing to write; this also avoids IndexError on data[0] below.
        return

    path = 'files/企查查.xlsx'
    if os.path.exists(path):
        wb = openpyxl.load_workbook(path)
        sheet = wb.active
    else:
        wb = openpyxl.Workbook()
        sheet = wb.active
        sheet.title = '科技'

    max_row = sheet.max_row
    if max_row == 1:
        # A sheet reporting one row is treated as "no data rows yet":
        # (re)write the title row from the first record's keys.
        for col, title in enumerate(data[0], start=1):
            sheet.cell(1, col).value = title

    # Data rows start directly below the last row that already exists.
    for offset, record in enumerate(data, start=1):
        row = max_row + offset
        for col, value in enumerate(record.values(), start=1):
            sheet.cell(row, col).value = value

    wb.save(path)
# Fetch result pages 2 through 10; parse and store each page that downloaded.
# NOTE(review): page 1 is not requested here - confirm that is intentional.
for page in range(2, 11):
    data = get_data(page)
    if not data:
        continue
    save_data(an_data(data))
七、pyecharts的使用
from pyecharts.charts import Bar, Line,Kline
from pyecharts.options import TitleOpts,LegendOpts,ToolboxOpts,DataZoomOpts, LabelOpts, \
MarkPointOpts, MarkPointItem, MarkLineOpts,MarkLineItem, AxisOpts,SplitAreaOpts, AreaStyleOpts
import openpyxl
from datetime import datetime
def get_data():
    """Load the Tencent 2017 stock sheet and split it into K-line chart inputs.

    Reads the first five columns of the active sheet of
    'files/腾讯2017年股票数据.xlsx', starting at row 2 (row 1 is skipped,
    presumably because it holds the column titles).

    Returns:
        A ``(dates, prices)`` tuple. ``dates`` is a list of 'year-month-day'
        strings (no zero padding); ``prices`` is a list whose items are lists
        of the remaining four cell values of each row, in sheet order.
    """
    wb = openpyxl.load_workbook('files/腾讯2017年股票数据.xlsx')
    sheet = wb.active
    dates = []
    prices = []
    # values_only=True yields tuples of plain cell values instead of Cell objects,
    # so no separate pass is needed to unwrap them.
    for day, *quote in sheet.iter_rows(min_row=2, max_col=5, values_only=True):
        if day is None:
            # A row without a date has nothing to plot and would otherwise
            # fail on the attribute access below.
            continue
        dates.append(f'{day.year}-{day.month}-{day.day}')
        prices.append(quote)
    return dates, prices
# print(get_data())
# datetime.datetime(2017, 1, 3, 0, 0)
def create_kline():
    """Draw the data returned by get_data() as a K-line chart in 'files/k线图.html'."""
    date, price = get_data()

    # Build the option objects first so the chart calls below stay short.
    split_area = SplitAreaOpts(
        is_show=True,
        areastyle_opts=AreaStyleOpts(opacity=1),
    )
    y_axis = AxisOpts(is_scale=True, splitarea_opts=split_area)
    zoom = DataZoomOpts(is_show=True)

    chart = Kline()
    # x axis: one label per row, e.g. ['2001', '2002', ...]
    chart.add_xaxis(date)
    # y axis: one list of four values per x label
    chart.add_yaxis('', price)
    chart.set_global_opts(datazoom_opts=zoom, yaxis_opts=y_axis)
    chart.render('files/k线图.html')
# Render the K-line chart when this script section is executed.
create_kline()
# 2.折线图
def create_line():
    """Draw a two-series line chart of sample city figures in 'files/line.html'."""
    cities = ['成都', '重庆', '北京', '武汉', '深圳', '上海', '昆明']
    confirmed = [120, 98, 456, 2837, 897, 1020, 34]
    deaths = [90, 50, 102, 340, 201, 290, 5]
    # Mark line of type 'average', attached to the first series only.
    average_line = MarkLineOpts(
        data=[MarkLineItem(type_='average', name='平均值')]
    )

    chart = Line()
    chart.add_xaxis(cities)
    chart.add_yaxis('确诊人数', confirmed, markline_opts=average_line)
    chart.add_yaxis('死亡人数', deaths)
    chart.render('files/line.html')
# create_line()
# 1.柱状图
def create_bar():
    """Draw a two-series bar chart of sample city figures in 'files/bar.html'."""
    # 1. Create the chart object
    bar = Bar()
    # 2. Add the data
    # Labels shown along the x axis
    bar.add_xaxis(['成都', '重庆', '北京', '武汉', '深圳', '上海', '昆明'])
    # Series name fixed from '确证人数' to '确诊人数' so it matches the chart subtitle.
    bar.add_yaxis('确诊人数', [120, 98, 456, 2837, 897, 1020, 34])
    bar.add_yaxis('死亡人数', [90, 50, 102, 340, 201, 290, 5])
    # 3. Options (optional)
    # Global options
    bar.set_global_opts(
        # Title and subtitle
        title_opts=TitleOpts(title='全国疫情信息表', subtitle='确诊人数和死亡人数', pos_left=50),
        # Hide the legend
        legend_opts=LegendOpts(is_show=False),
        # Show the toolbox
        toolbox_opts=ToolboxOpts(is_show=True),
        # Show the zoom control
        datazoom_opts=DataZoomOpts(is_show=True),
    )
    # Series options
    bar.set_series_opts(
        label_opts=LabelOpts(
            # Show the value labels
            is_show=True,
            position='left',
        ),
        # Max/min mark points can be enabled by also passing:
        # markpoint_opts=MarkPointOpts([
        #     MarkPointItem(type_='max', name='最大值'),
        #     MarkPointItem(type_='min', name='最小值')
        # ])
    )
    # 4. Render
    bar.render('files/bar.html')