爬取数据部分
导包
from selenium. webdriver import Chrome
from selenium. webdriver. support. select import Select
from time import sleep
from selenium. webdriver. chrome. options import Options
无头和防屏蔽设置(固定代码,一般最后加上)
# Chrome start-up options: run without a visible browser window.
opt = Options()
opt.add_argument('--headless')
# Chrome's switch is spelled with a hyphen; '--disable gpu' (with a space)
# is not a switch Chrome recognises.
opt.add_argument('--disable-gpu')
请求网址并事先创建一个文件
# Selenium 4 removed the find_element_by_* helpers; find_element(By..., ...)
# works on both older and current releases.
from selenium.webdriver.common.by import By

# Open the yearly box-office page in the configured browser.
chrome = Chrome(options=opt)
url = 'https://www.endata.com.cn/BoxOffice/BO/Year/index.html'
try:
    chrome.get(url)
    sleep(1)  # presumably gives the page time to render before querying it

    # Append mode: rows written by earlier runs stay in the file.
    # The with-block guarantees the rows are flushed and the handle is closed
    # before anything else reads ./movies.csv.
    with open('./movies.csv', mode='a', encoding='utf-8') as f:
        # Select wraps the year drop-down so its options can be chosen by index.
        select_el = chrome.find_element(By.ID, 'OptionDate')
        select = Select(select_el)
        for i in range(len(select.options)):
            select.select_by_index(i)
            sleep(2)  # presumably waits for the table to reload for the chosen year

            # [1:] skips the first <tr> (presumably the header row).
            tr_list = chrome.find_elements(
                By.XPATH, '//table[@class="bo-table img-table"]//tr'
            )[1:]
            for tr in tr_list:
                # One line per table row; every cell is followed by a comma,
                # so each line ends with a trailing comma.
                for td in tr.find_elements(By.XPATH, './td'):
                    f.write(td.text.strip())
                    f.write(',')
                f.write('\n')
            # Blank lines separate one option's rows from the next.
            f.write('\n\n')
            print('第%d页打印完毕' % i)
finally:
    # quit() ends the WebDriver session even if a step above raised.
    chrome.quit()
数据处理部分
导包
import pandas as pd
数据的清洗,去重
# Load the scraped rows (no header line) and keep only columns 2..4.
# .copy() makes the slice an independent frame so new columns can be added
# without pandas' chained-assignment warning.
data = pd.read_csv('./movies.csv', header=None)
data = data.loc[:, 2:4].copy()

# Column 2 holds '/'-separated labels; only the first two are used.
data[5] = data[2].map(lambda i: i.split('/')[0])
data[6] = data[2].map(lambda i: i.split('/')[1] if '/' in i else None)

# Columns 3 and 4 are joined as text and later parsed as a single integer.
# NOTE(review): presumably the amount contained a thousands comma that split
# it across two CSV fields - confirm against the scraped data.
data[7] = data[3].astype(str) + data[4].astype(str)

# One (label, amount) row per label: first labels, then second labels
# where a second label exists.
data_1 = data.loc[:, [5, 7]]
data_2 = data.loc[data[6].notna(), [6, 7]]
data_2 = data_2.rename(columns={6: 5})

# DataFrame.append was removed in pandas 2.0; concat is the replacement.
data = pd.concat([data_1, data_2])
data = data.rename(columns={5: 'type', 7: 'money'})
data['money'] = data['money'].astype(int)

# Mean amount per label, rounded to two decimals.
booking_o = data.groupby('type').mean().round(2)
# Drop the '-' placeholder group; errors='ignore' keeps this from raising
# when no such group exists.
booking_o = booking_o.drop(['-'], errors='ignore')
booking_o.to_csv('./数据.csv')
可视化展示
导包
from flask import Flask, render_template
import pandas as pd
Flask可视化展示
app = Flask(__name__)


@app.route('/')
def index():
    """Render the chart page from the aggregated CSV.

    Reads ./数据.csv on every request, renames its two columns to
    'name' and 'value', floor-divides the values by 1000 and passes the
    rows to the template as a list of {'value': ..., 'name': ...} dicts.
    """
    frame = pd.read_csv('./数据.csv')
    frame.columns = ['name', 'value']
    # Reorder so each record lists 'value' before 'name'.
    frame = frame[['value', 'name']]
    frame['value'] = frame['value'] // 1000
    records = frame.to_dict(orient='records')
    return render_template('数据.html', data=records)


if __name__ == '__main__':
    # Starts Flask's built-in server with debug mode enabled.
    app.run(debug=True)
HTML界面与ECharts源代码的结合
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
<!-- Load ECharts from the CDN -->
<script src="https://cdn.bootcdn.net/ajax/libs/echarts/5.0.2/echarts.min.js"></script>
<!-- Chart container with an explicit size for ECharts to draw into -->
<div id="main" style="width: 800px;height:600px;background:pink"></div>
<script type="text/javascript">
    var myChart = echarts.init(document.getElementById('main'));
    var option = {
        legend: {
            top: 'bottom'
        },
        toolbox: {
            show: true,
            feature: {
                mark: {show: true},
                dataView: {show: true, readOnly: false},
                restore: {show: true},
                saveAsImage: {show: true}
            }
        },
        series: [
            {
                name: '面积模式',
                type: 'pie',
                radius: [50, 250],
                center: ['50%', '50%'],
                roseType: 'area',
                itemStyle: {
                    borderRadius: 8
                },
                // Jinja serialises the list of {value, name} records passed
                // by the Flask view into a JavaScript array literal.
                data: {{ data|tojson }}
            }
        ]
    };
    myChart.setOption(option);
</script>
</body>
</html>