1. Environment Setup
Linux environment
java -version
# java version "1.8.0_391"
# Java(TM) SE Runtime Environment (build 1.8.0_391-b13)
# Java HotSpot(TM) 64-Bit Server VM (build 25.391-b13, mixed mode)
mysql --version
# mysql Ver 8.0.35 for Linux on x86_64 (MySQL Community Server - GPL)
ls -l /opt
# total 8
# drwxr-xr-x  28 root  root 4096 Jan 27 00:48 anaconda3
# drwxr-xr-x  11 root  root  227 Jan 26 19:23 hadoop-3.3.6
# drwxr-xr-x   5 root  root  114 Feb 12 12:35 hadoop-snappy-master
# drwxr-xr-x  10 root  root  205 Feb 12 18:53 hive-3.1.3
# drwxr-xr-x   6 root  root   99 Feb 12 12:23 maven-3.9.6
# drwxr-xr-x   6 60692 5000 4096 Feb 12 12:21 snappy-1.1.1
# drwxrwxrwx  15 root  root  235 Feb  7 20:40 spark-3.5.0
# drwxrwxrwx.  4 root  root   32 Feb 11 22:19 tmp
hdfs dfs -ls /
# Found 3 items
# drwxrwxrwx - root supergroup 0 2024-02-21 13:48 /sparklog
# drwxrwxrwx - root supergroup 0 2024-02-17 01:28 /tmp
# drwxrwxrwx - root supergroup 0 2024-02-17 01:04 /user
Create and activate a virtual environment with conda
conda create -n pyspark python=3.8  # PySpark 3.5 requires Python 3.8+; pick any matching 3.x version
conda activate pyspark
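Install the libraries imported below into the new environment. A minimal sketch; the versions are assumptions, and pyspark should be pinned to match the Spark 3.5.0 installation above:
pip install pyspark==3.5.0 requests pandas tqdm lxml "pyhive[hive]" pyecharts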
Python imports
import re
import time
import requests
import pandas as pd
from tqdm import tqdm
from lxml import etree
from pyhive import hive
from pyspark.sql import SparkSession
from pyecharts import charts
from pyecharts import options as opts
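The SparkSession import above is what later creates the Spark entry point. A minimal sketch of session creation (the app name is an assumption, and enableHiveSupport assumes Spark can see the Hive configuration from /opt/hive-3.1.3):
spark = SparkSession.builder \
    .appName('gdp_analysis') \
    .enableHiveSupport() \
    .getOrCreate()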
2. Data Acquisition
Target URL: https://www.hongheiku.com/category/gdjsgdp
Spoofed request header: 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'
Send a GET request with requests to fetch a single page of HTML.
Parse the HTML with XPath expressions to extract the province and GDP lists, plus the total page count as an integer.
Convert the lists into a pandas DataFrame (see the sketch just below).
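A minimal sketch of that conversion, with hypothetical sample values standing in for the parsed lists (the real lists come from the XPath calls in the code that follows):
# Hypothetical sample lists standing in for the XPath results
id_list = ['1', '2']
pro_list = ['广东', '江苏']
gdp_list = ['135673', '122875']
df = pd.DataFrame({'id': id_list, 'province': pro_list, 'gdp': gdp_list})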
# Fetch a single page of data
def get_page_data(page):
    # Page 1 has a different URL from subsequent pages
    urls = {
        'page=1': 'https://www.hongheiku.com/category/gdjsgdp',
        'page>1': 'https://www.hongheiku.com/category/gdjsgdp/page/{}'.format(page)}
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36'}
    url = urls['page=1'] if page == 1 else urls['page>1']
    response = requests.get(url=url, headers=headers)
    if response.status_code != 200:
        # Surface the failing status code to the caller
        return response.status_code
    text = response.text
    # Optional local cache, handy while debugging the parser:
    # with open('data.html', 'w', encoding='utf-8') as file:
    #     file.write(text)
    # with open('data.html', 'r', encoding='utf-8') as file:
    #     text = file.read()
    element = etree.HTML(text)
    # Ranking column of the table rows
    id = element.xpath('//tr[@class="even"]/td[@class="column-1"]//center//text()')
    pro = element.xpath('