# 01 房天下 — Fang.com new-house listing scraper
import requests
from lxml import etree
from pyquery import PyQuery
import csv
def an_data(data: str):
    """Parse one Fang.com new-house listing page and append the rows to a CSV.

    Args:
        data: HTML text of a listing page.

    Returns:
        True when the page contains no listings (signals the caller to stop
        paging), otherwise None after the rows have been appended.
    """
    html = PyQuery(data)
    lis = html('#newhouse_loupai_list>ul>li')
    house_list = []
    for x in lis:
        house = {}
        p_x = PyQuery(x)
        name_a = p_x('.nlcd_name>a')
        house['楼盘名'] = name_a.text()
        url = name_a.attr('href')
        # Listing links are protocol-relative ("//..."); prefix the scheme so
        # the stored link is directly usable.  (Bug fix: the original computed
        # the https URL into an unused variable and saved the raw href.)
        house['连接'] = 'https:' + url if url else url
        house['售价'] = p_x('.nhouse_price>span').text()
        house['户型/面积'] = p_x('.house_type.clearfix>a').text()
        house['地址'] = p_x('.address').text()
        box = p_x('.fangyuan')
        house['销售情况'] = box('span').text()
        house['特点'] = box('a').text()
        print('==========================================')
        house_list.append(house)
    if not house_list:
        # An empty page means we have walked past the last page.
        return True
    with open('files/成都新房.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, list(house_list[0].keys()))
        # Bug fix: write the header row exactly once, when the file is still
        # empty — the original never wrote a header at all.
        if f.tell() == 0:
            writer.writeheader()
        writer.writerows(house_list)
def get_data(index):
    """Fetch listing page *index* and feed its HTML to an_data.

    Returns whatever an_data returns (True on an empty final page), or
    None when the HTTP request does not succeed.
    """
    page_url = 'https://cd.newhouse.fang.com/house/s/b9' + str(index)
    resp = requests.get(page_url)
    resp.encoding = 'gb2312'
    if resp.status_code != 200:
        print('获取失败!')
        return None
    return an_data(resp.text)
def get_all_data():
    """Walk listing pages from 1 upward until get_data signals the last page."""
    page = 1
    # get_data returns a truthy value only when a page comes back empty.
    while not get_data(page):
        page += 1
        print(page)


get_all_data()
# 02 excel文件的读操作 — reading Excel workbooks with openpyxl
# Demo: the openpyxl read API, step by step.
import openpyxl

wb = openpyxl.load_workbook('files/example.xlsx')

# Sheet lookup: list of names, by name, by the active tab, by position.
print(wb.sheetnames)
ranking = wb['排行榜']
print(ranking)
active_sheet = wb.active
print(active_sheet)
sheets = wb.worksheets
print(sheets)
last_sheet = sheets[-1]
print(last_sheet)
print(ranking.title)

# Dimensions of the used area.
rows_used = ranking.max_row
cols_used = ranking.max_column
print(rows_used, cols_used)

# Single cells: by coordinate string, or by (row, column) index.
top_left = ranking['A1']
print(top_left)
print(ranking.cell(3, 2))

# Row-wise iteration: the whole sheet, then a bounded window.
print(list(ranking.iter_rows()))
print(list(ranking.iter_rows(min_row=4, max_row=9, min_col=1, max_col=2)))

# Column-wise iteration; keep this generator for the loop at the bottom.
columns = ranking.iter_cols()
print(list(ranking.iter_cols(min_row=2, min_col=3)))

# Rectangular slice by coordinate range.
print(ranking['b3':'C9'])

# Cell attributes: stored value plus position info.
print(top_left.value)
print(top_left.row)
print(top_left.column)
print(top_left.coordinate)

# Walk every cell column-by-column and dump its value.
for column in columns:
    for cell in column:
        print(cell.value)
# 03 excel文件写操作 — writing Excel workbooks
# Demo: open an existing workbook and save it back to the same path.
import openpyxl

workbook = openpyxl.load_workbook('files/new_ex.xlsx')
goods_sheet = workbook['商品信息']  # selected sheet (unused beyond the lookup)
workbook.save('files/new_ex.xlsx')
# 03 饼状图 — pie / rose chart with pyecharts
# Demo: rose-style pie chart of monthly sales per brand.
from pyecharts.charts import Pie
from pyecharts import options

monthly_sales = [
    ('苹果', 560),
    ('华为', 789),
    ('小米', 623),
    ('oppo', 200),
]

chart = Pie()
chart.add(
    '月销量',
    monthly_sales,
    radius=['50', '150'],  # inner / outer radius
    rosetype='radius',     # rose mode: sector radius encodes the value
)
# Label each sector as "name-percentage".
chart.set_series_opts(label_opts=options.LabelOpts(formatter='{b}-{d}%'))
chart.render('files/饼状图.html')
# 04 企查查 — Qichacha company-search scraper
import requests
from bs4 import BeautifulSoup
import os
import openpyxl
def an_data(data: str):
    """Extract company records from a Qichacha search-result page.

    Args:
        data: HTML text of one result page.

    Returns:
        A list of dicts with url / name / ceo / tel / money fields,
        one per company row.
    """
    soup = BeautifulSoup(data, 'lxml')
    records = []
    for row in soup.select('.ntable>tr'):
        link = row.select('.title')[0]
        values = row.select('.val')
        records.append({
            'url': link.attrs['href'],
            'name': link.select('span')[0].get_text(),
            'ceo': values[0].get_text(),
            'tel': values[3].get_text(),
            'money': values[1].get_text(),
        })
    return records
def get_data(index, keyword='科技'):
    """Fetch one Qichacha search-result page.

    Args:
        index: 1-based result-page number.
        keyword: search term (generalized from the original hard-coded
            '科技'; the default preserves the old behavior).

    Returns:
        The page HTML on HTTP 200, otherwise None.
    """
    url = 'https://www.qcc.com/web/search?key=' + keyword + '&p=' + str(index)
    # The site rejects anonymous requests; a browser UA plus a logged-in
    # session cookie are required.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.114 Safari/537.36',
        'cookie': 'QCCSESSID=qcil9523aq26u5igmmo9mcm3r7; zg_did=%7B%22did%22%3A%20%22178905f87192d4-0a9af7c29ac9bb-1633685b-fa000-178905f871a49c%22%7D; UM_distinctid=178905f8dea1df-03d3261599966d-1633685b-fa000-178905f8deb349; hasShow=1; _uab_collina=161732988514123890189604; acw_tc=7d40019f16173433620472409ead0a0f89abba0b90cfd8e6695135426c; CNZZDATA1254842228=1019155693-1617329158-%7C1617339958; zg_de1d1a35bfa24ce29bbf2c7eb17e6c4f=%7B%22sid%22%3A%201617343363693%2C%22updated%22%3A%201617343471568%2C%22info%22%3A%201617329882920%2C%22superProperty%22%3A%20%22%7B%5C%22%E5%BA%94%E7%94%A8%E5%90%8D%E7%A7%B0%5C%22%3A%20%5C%22%E4%BC%81%E6%9F%A5%E6%9F%A5%E7%BD%91%E7%AB%99%5C%22%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22%22%2C%22cuid%22%3A%20%2213c101d390dcf33467df46d59cfb4a54%22%2C%22zs%22%3A%200%2C%22sc%22%3A%200%7D'
    }
    response = requests.get(url, headers=headers)
    response.encoding = 'utf-8'
    if response.status_code == 200:
        return response.text
    print('请求失败')
    return None
def save_data(data: list):
    """Append scraped company dicts to files/企查查.xlsx.

    Creates the workbook (plus a header row taken from the first dict's
    keys) on first use; later calls append below the existing rows.

    Args:
        data: list of flat dicts with identical key order.
    """
    if not data:
        # Robustness fix: nothing to write.  The original indexed data[0]
        # unconditionally on a fresh sheet and crashed on an empty list.
        return
    path = 'files/企查查.xlsx'
    if not os.path.exists(path):
        wb = openpyxl.Workbook()
        sheet = wb.active
        sheet.title = '科技'
    else:
        wb = openpyxl.load_workbook(path)
        sheet = wb.active
    max_row = sheet.max_row
    if max_row == 1:
        # A brand-new/empty sheet reports max_row == 1: write the header.
        for col, title in enumerate(data[0].keys(), start=1):
            sheet.cell(1, col).value = title
    for offset, record in enumerate(data):
        row = max_row + 1 + offset
        for col, key in enumerate(record, start=1):
            sheet.cell(row, col).value = record[key]
    wb.save(path)
# Crawl result pages 2..10: fetch, parse, and persist each page's records.
for page in range(2, 11):
    page_html = get_data(page)
    if not page_html:
        continue
    save_data(an_data(page_html))
# 04 地图 — China map chart with pyecharts
# Demo: China choropleth of confirmed case counts per province.
from pyecharts.charts import Map
from pyecharts import options

province_cases = [
    ('重庆', 579),
    ('黑龙江', 925),
    ('北京', 593),
    ('山西', 197),
    ('浙江', 1268),
    ('新疆', 290),
]

# Fix: renamed from `map`, which shadowed the builtin of the same name.
china_map = Map()
china_map.add('确诊人数', province_cases, maptype='china', zoom=0.7)
china_map.set_global_opts(
    title_opts=options.TitleOpts(title='全国疫情信息'),
    # Piecewise legend bucketing values in the 0..2000 range.
    visualmap_opts=options.VisualMapOpts(max_=2000, min_=0, is_piecewise=True),
    legend_opts=options.LegendOpts(is_show=False),
)
china_map.render('files/地图.html')
# 05 pyecharts的使用 — bar / line / K-line charts with pyecharts
from pyecharts.charts import Bar, Line,Kline
from pyecharts.options import TitleOpts,LegendOpts,ToolboxOpts,DataZoomOpts, LabelOpts, \
MarkPointOpts, MarkPointItem, MarkLineOpts,MarkLineItem, AxisOpts,SplitAreaOpts, AreaStyleOpts
import openpyxl
from datetime import datetime
def get_data():
    """Load the Tencent 2017 stock rows from the spreadsheet.

    Returns:
        (dates, prices): dates as 'Y-M-D' strings, prices as the four
        remaining cell values (columns B..E) per row.
    """
    wb = openpyxl.load_workbook('files/腾讯2017年股票数据.xlsx')
    sheet = wb.active
    dates, prices = [], []
    # Start at row 2 (row 1 is presumably a header); read columns A..E.
    for row in sheet.iter_rows(min_row=2, max_col=5):
        values = [cell.value for cell in row]
        day = values[0]  # a datetime in column A
        dates.append(f'{day.year}-{day.month}-{day.day}')
        prices.append(values[1:])
    return dates, prices
def create_kline():
    """Render a candlestick (K-line) chart of the stock data to an HTML file."""
    dates, prices = get_data()
    chart = Kline()
    chart.add_xaxis(dates)
    chart.add_yaxis('', prices)
    chart.set_global_opts(
        # Zoom slider, plus a value-scaled y-axis with filled split areas.
        datazoom_opts=DataZoomOpts(is_show=True),
        yaxis_opts=AxisOpts(
            is_scale=True,
            splitarea_opts=SplitAreaOpts(
                is_show=True,
                areastyle_opts=AreaStyleOpts(opacity=1),
            ),
        ),
    )
    chart.render('files/k线图.html')


create_kline()
def create_line():
    """Render a line chart of per-city counts, with an average mark line."""
    cities = ['成都', '重庆', '北京', '武汉', '深圳', '上海', '昆明']
    chart = Line()
    chart.add_xaxis(cities)
    chart.add_yaxis(
        '确诊人数',
        [120, 98, 456, 2837, 897, 1020, 34],
        # Horizontal mark line at the series average.
        markline_opts=MarkLineOpts(
            data=[MarkLineItem(type_='average', name='平均值')]
        ),
    )
    chart.add_yaxis('死亡人数', [90, 50, 102, 340, 201, 290, 5])
    chart.render('files/line.html')
def create_bar():
    """Render a bar chart of per-city counts to files/bar.html."""
    cities = ['成都', '重庆', '北京', '武汉', '深圳', '上海', '昆明']
    chart = Bar()
    chart.add_xaxis(cities)
    chart.add_yaxis('确证人数', [120, 98, 456, 2837, 897, 1020, 34])
    chart.add_yaxis('死亡人数', [90, 50, 102, 340, 201, 290, 5])
    chart.set_global_opts(
        title_opts=TitleOpts(title='全国疫情信息表', subtitle='确诊人数和死亡人数', pos_left=50),
        legend_opts=LegendOpts(is_show=False),
        toolbox_opts=ToolboxOpts(is_show=True),
        datazoom_opts=DataZoomOpts(is_show=True),
    )
    chart.set_series_opts(
        # Show value labels, placed to the left of each bar.
        label_opts=LabelOpts(is_show=True, position='left'),
    )
    chart.render('files/bar.html')