1 Spark和PySpark的介绍
1.1 PySpark
1.1.1 安装PySpark库
pip install pyspark
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pyspark
1.1.2 构建PySpark执行环境入口对象
# Import the PySpark entry-point classes
from pyspark import SparkConf, SparkContext

# Build the configuration one call at a time. This is equivalent to the
# chained form:
#   SparkConf().setMaster("local[*]").setAppName("test_spark_app")
conf = SparkConf()
conf.setMaster("local[*]")
conf.setAppName("test_spark_app")

# Create the SparkContext from the configuration object
sc = SparkContext(conf=conf)
# Print the version of Spark that is running
print(sc.version)
# Stop the SparkContext (this ends the PySpark program)
sc.stop()
1.1.3 PySpark的编程模型
1.1.4 RDD对象
1.1.4.1 将容器转化为RDD对象
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
sc = SparkContext(conf=conf)

# parallelize() turns a Python container into an RDD.
rdd1 = sc.parallelize([1, 2, 3, 4, 5])          # list
rdd2 = sc.parallelize("12345")                  # str: one element per character
rdd3 = sc.parallelize((1, 2, 3, 4, 5))          # tuple
rdd4 = sc.parallelize({1, 2, 3, 4, 5})          # set
rdd5 = sc.parallelize({"name": 1, "age": 2})    # dict: only the keys are kept

# collect() returns the RDD contents as a Python list.
for rdd in (rdd1, rdd2, rdd3, rdd4, rdd5):
    print(rdd.collect())

# Release the context once the demo is done, as the first example does.
sc.stop()
[1, 2, 3, 4, 5]
['1', '2', '3', '4', '5']
[1, 2, 3, 4, 5]
[1, 2, 3, 4, 5]
['name', 'age']
读取文件转RDD对象
from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
sc = SparkContext(conf=conf)

# textFile() reads a text file into an RDD with one element per line.
rdd = sc.textFile("C:/Users/18757/Desktop/pythontext/bill.txt")  # file path
print(rdd.collect())

# Release the context once the demo is done, as the first example does.
sc.stop()
['周杰轮,2022-01-01,100000,消费,正式', '周杰轮,2022-01-02,300000,消费,正式', '周杰轮,2022-01-03,100000,消费,测试', '林俊节,2022-01-01,300000,消费,正式', '林俊节,2022-01-02,100000,消费,正式', '林俊节,2022-01-03,100000,消费,测试', '林俊节,2022-01-02,100000,消费,正式']
1.1.4.2 RDD操作
1.1.4.2.1 map算子
from pyspark import SparkConf, SparkContext
import os

# Tell PySpark which Python interpreter to use
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
sc = SparkContext(conf=conf)
rdd = sc.parallelize([1, 2, 3, 4, 5])


def func(data):
    """Return ``data`` multiplied by 10."""
    return data * 10


# map() applies a function to every element; here every value is multiplied
# by 10 and then, through a second chained map(), incremented by 1.
rdd2 = rdd.map(func).map(lambda x: x + 1)
print(rdd2.collect())

# Release the context once the demo is done, as the first example does.
sc.stop()
[11, 21, 31, 41, 51]
1.1.4.2.2 flatmap算子
from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
sc = SparkContext(conf=conf)
rdd = sc.parallelize(["itheima itcast 666", "itheima itheima itcast", "python itheima"])


def func(data):
    """Split a space-separated string into a list of words."""
    return data.split(" ")


# Goal: extract the individual words from the RDD.
# With map() each input string becomes one list, so the result is nested.
rdd2 = rdd.map(func)
print(rdd2.collect())

# Release the context once the demo is done, as the first example does.
sc.stop()
[['itheima', 'itcast', '666'], ['itheima', 'itheima', 'itcast'], ['python', 'itheima']]
from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
sc = SparkContext(conf=conf)
rdd = sc.parallelize(["itheima itcast 666", "itheima itheima itcast", "python itheima"])


def func(data):
    """Split a space-separated string into a list of words."""
    return data.split(" ")


# Goal: extract the individual words from the RDD.
# flatMap() works like map() but also removes one level of nesting, so the
# result is a flat list of words.
rdd2 = rdd.flatMap(func)
print(rdd2.collect())

# Release the context once the demo is done, as the first example does.
sc.stop()
['itheima', 'itcast', '666', 'itheima', 'itheima', 'itcast', 'python', 'itheima']
1.1.4.2.3 reduceByKey算子
from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

# An RDD of (key, value) 2-tuples
rdd = sc.parallelize([('a', 1), ('a', 1), ('b', 1), ('b', 1), ('b', 1)])
# reduceByKey() groups by key and folds the values of each key pairwise
result = rdd.reduceByKey(lambda a, b: a + b)
print(result.collect())

# Release the context once the demo is done, as the first example does.
sc.stop()
[('b', 3), ('a', 2)]
1.1.4.2.4 练习案例1
将以下文档中,各个单词出现的次数统计出来
from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

# Read the document, one RDD element per line
rdd = sc.textFile(r"C:\Users\18757\Desktop\pythontext\3\hello.txt")
# Split every line into words and flatten into a single RDD of words
word_rdd = rdd.flatMap(lambda x: x.split(" "))
# Pair every word with a count of 1: (word, 1)
word_with_one_rdd = word_rdd.map(lambda x: (x, 1))
# Sum the counts per word
result = word_with_one_rdd.reduceByKey(lambda a, b: a + b)
print(result.collect())

# Release the context once the job is done, as the first example does.
sc.stop()
[('itcast', 4), ('python', 6), ('itheima', 7), ('spark', 4), ('pyspark', 3)]
1.1.4.2.5 Filter
from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

rdd = sc.parallelize([1, 2, 3, 4, 5])
# filter() keeps the elements for which the function returns True
# (here: the even numbers)
result = rdd.filter(lambda x: x % 2 == 0)
print(result.collect())

# Release the context once the demo is done, as the first example does.
sc.stop()
[2, 4]
1.1.4.2.6 distinct算子
from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

rdd = sc.parallelize([1, 1, 3, 3, 5, 5, 6, 6, 6])
# distinct() removes duplicate elements
rdd = rdd.distinct()
print(rdd.collect())

# Release the context once the demo is done, as the first example does.
sc.stop()
[1, 3, 5, 6]
1.1.4.2.7 sortBy方法
from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

# Word count, as in the previous exercise
rdd = sc.textFile(r"C:\Users\18757\Desktop\pythontext\3\hello.txt")
word_rdd = rdd.flatMap(lambda x: x.split(" "))
word_with_one_rdd = word_rdd.map(lambda x: (x, 1))
word_count_rdd = word_with_one_rdd.reduceByKey(lambda a, b: a + b)

# Sort by the count (element 1 of each tuple), descending, in one partition
result = word_count_rdd.sortBy(lambda x: x[1], ascending=False, numPartitions=1)
print(result.collect())

# Release the context once the job is done, as the first example does.
sc.stop()
1.1.4.2.8 综合案例
from pyspark import SparkConf, SparkContext
import json
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

# Task 1: total sales per city, ranked by sales
# 1.1 Read the file into an RDD
file_rdd = sc.textFile(r"C:\Users\18757\Desktop\pythontext\3\orders.txt")
# 1.2 Split out the individual JSON strings (separated by "|")
JSON_rdd = file_rdd.flatMap(lambda x: x.split("|"))
# Sample (29 records in total, abbreviated):
# ['{"id":1,"timestamp":"2019-05-08T01:03.00Z","category":"平板电脑","areaName":"北京","money":"1450"}',
#  '{"id":2,"timestamp":"2019-05-08T01:01.00Z","category":"手机","areaName":"北京","money":"1450"}', ...]
# 1.3 Parse every JSON string into a dict
file_dict = JSON_rdd.map(lambda x: json.loads(x))
# Sample (abbreviated):
# [{'id': 1, 'timestamp': '2019-05-08T01:03.00Z', 'category': '平板电脑', 'areaName': '北京', 'money': '1450'}, ...]
# 1.4 Keep (city, sales); "money" is stored as a string, so convert it to int
city_with_money = file_dict.map(lambda x: (x["areaName"], int(x["money"])))
# Sample (abbreviated):
# [('北京', 1450), ('北京', 1450), ('北京', 8412), ('上海', 1513), ('北京', 1550), ('杭州', 1550), ...]
# 1.5 Sum the sales per city, then rank by sales in descending order
city_with_money_result = city_with_money.reduceByKey(lambda a, b: a + b)
# [('杭州', 28831), ('天津', 12260), ('北京', 91556), ('上海', 1513), ('郑州', 1120)]
city_with_money_result = city_with_money_result.sortBy(
    lambda x: x[1], ascending=False, numPartitions=1
)
print(city_with_money_result.collect())

# Task 2: for every city, which product categories are on sale
city_with_category = file_dict.map(lambda x: (x["areaName"], x["category"]))
# Sample (abbreviated):
# [('北京', '平板电脑'), ('北京', '手机'), ('北京', '手机'), ('上海', '电脑'), ('北京', '家电'), ...]
# Drop duplicate (city, category) pairs, using a single partition
city_with_category = city_with_category.distinct(1)


def func(a, b):
    """Join two category strings with the "、" separator."""
    return a + "、" + b


city_with_category_result = city_with_category.reduceByKey(func)
# [('北京', '平板电脑、手机、家电、电脑、家具、书籍、食品、服饰'), ('上海', '电脑'), ('杭州', '电脑、家具、服饰'), ('郑州', '家具'), ('天津', '电脑、食品')]
print(city_with_category_result.collect())

# Release the context once both tasks are done, as the first example does.
sc.stop()
[('北京', 91556), ('杭州', 28831), ('天津', 12260), ('上海', 1513), ('郑州', 1120)]
[('北京', '平板电脑、手机、家电、电脑、家具、书籍、食品、服饰'), ('上海', '电脑'), ('杭州', '电脑、家具、服饰'), ('郑州', '家具'), ('天津', '电脑、食品')]
1.1.5 将RDD对象转化为Python中的数据输出
1.1.5.1 数据输出的四个算子
1.1.5.1.1 reduce算子
1.1.5.1.2 collect算子
1.1.5.1.3 take算子
1.1.5.1.4 count算子
1.1.5.1.5 四个算子的综合使用示例
from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)
rdd = sc.parallelize([1, 1, 3, 3])

# collect(): all elements as a list
print(f"RDD对象的内容为{rdd.collect()}")
# take(n): the first n elements as a list
print(f"RDD对象中前3个数据为{rdd.take(3)}")
# count(): the number of elements
print(f"RDD对象中数据个数为{rdd.count()}")
# reduce(f): fold all elements pairwise with f (here: the sum)
print(f"RDD对象中数据之和为{rdd.reduce(lambda a, b: a + b)}")

# Release the context once the demo is done, as the first example does.
sc.stop()
RDD对象的内容为[1, 1, 3, 3]
RDD对象中前3个数据为[1, 1, 3]
RDD对象中数据个数为4
RDD对象中数据之和为8
1.1.6 将RDD对象输出到文件中
1.1.6.1 saveAsTextFile算子
以下代码输出到文件中会形成多个分区,多个分区会导致数据输出到多个不同的文件中
from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
# Location of the local Hadoop installation used when writing files
os.environ['HADOOP_HOME'] = r"D:\hadoop-3.0.0"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

rdd1 = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = sc.parallelize([("Hello", 3), ("Spark", 5), ("Hi", 7)])
rdd3 = sc.parallelize([[1, 3, 5], [6, 7, 9], [11, 13, 11]])

# saveAsTextFile() writes the RDD into the given output directory.
# With the default settings the RDD has several partitions, and each
# partition is written to its own file.
rdd1.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output1")
rdd2.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output2")
rdd3.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output3")

# Release the context once the files are written, as the first example does.
sc.stop()
若要输出到一个分区,则
1.1.6.1.1 输出一个分区:方式一
from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
os.environ['HADOOP_HOME'] = r"D:\hadoop-3.0.0"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
# Method 1: set the default parallelism to 1 globally, so the RDDs created
# below get a single partition and each is written to a single file.
conf.set("spark.default.parallelism", "1")
sc = SparkContext(conf=conf)

rdd1 = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = sc.parallelize([("Hello", 3), ("Spark", 5), ("Hi", 7)])
rdd3 = sc.parallelize([[1, 3, 5], [6, 7, 9], [11, 13, 11]])

rdd1.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output1")
rdd2.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output2")
rdd3.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output3")

# Release the context once the files are written, as the first example does.
sc.stop()
1.1.6.1.2 输出一个分区:方式二
from pyspark import SparkConf, SparkContext
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
os.environ['HADOOP_HOME'] = r"D:\hadoop-3.0.0"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

# Method 2: request one partition per RDD through the second argument of
# parallelize() (numSlices). The global "spark.default.parallelism" setting
# is deliberately not used here, so that this example shows the per-RDD
# approach on its own.
rdd1 = sc.parallelize([1, 2, 3, 4, 5], 1)
rdd2 = sc.parallelize([("Hello", 3), ("Spark", 5), ("Hi", 7)], 1)
rdd3 = sc.parallelize([[1, 3, 5], [6, 7, 9], [11, 13, 11]], 1)

rdd1.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output1")
rdd2.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output2")
rdd3.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output3")

# Release the context once the files are written, as the first example does.
sc.stop()
1.1.7 综合案例
from pyspark import SparkConf, SparkContext
import json
import os

os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
# This script writes files with saveAsTextFile; set HADOOP_HOME the same way
# the other file-output examples do.
os.environ['HADOOP_HOME'] = r"D:\hadoop-3.0.0"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
conf.set("spark.default.parallelism", "1")
sc = SparkContext(conf=conf)

# Read the log file into an RDD (one element per line)
rdd = sc.textFile(r"C:\Users\18757\Desktop\pythontext\13\search_log.txt")
# Split every line on the TAB character into a list of fields
rdd = rdd.map(lambda data: data.split("\t"))

# Task 1: top 3 time slots by number of searches.
# The key is the first two characters of field 0 (presumably the hour of a
# HH:MM:SS timestamp -- confirm against the data file); each record counts 1.
rdd1 = rdd.map(lambda data: (data[0][:2], 1))
result1 = (
    rdd1.reduceByKey(lambda a, b: a + b)
    .sortBy(lambda data: data[1], ascending=False, numPartitions=1)
    .take(3)
)
print(result1)

# Task 2: top 3 search keywords (field 2)
rdd2 = rdd.map(lambda data: (data[2], 1))
result2 = (
    rdd2.reduceByKey(lambda a, b: a + b)
    .sortBy(lambda data: data[1], ascending=False, numPartitions=1)
    .take(3)
)
print(result2)

# Task 3: the time slot in which the keyword '黑马程序员' is searched most
result3 = (
    rdd.filter(lambda x: x[2] == '黑马程序员')
    .map(lambda data: (data[0][:2], 1))
    .reduceByKey(lambda a, b: a + b)
    .sortBy(lambda x: x[1], ascending=False, numPartitions=1)
    .take(1)
)
print(result3)

# Task 4: convert every record to a JSON string and write the result out.
# json.dumps is required here: saving the dicts directly would write Python
# dict reprs (single quotes), which are not valid JSON. ensure_ascii=False
# keeps the Chinese keywords readable in the output file.
json_rdd = rdd.map(
    lambda x: json.dumps(
        {
            "time": x[0],
            "user_id": x[1],
            "key_word": x[2],
            "rank1": x[3],
            "rank2": x[4],
            "url": x[5],
        },
        ensure_ascii=False,
    )
)
# saveAsTextFile returns None; the name result4 is kept from the original script.
result4 = json_rdd.saveAsTextFile(r"C:\Users\18757\Desktop\pythontext\output\json_search_log")

# Release the context once all tasks are done, as the first example does.
sc.stop()
[('20', 3479), ('23', 3087), ('21', 2989)]
[('scala', 2310), ('hadoop', 2268), ('博学谷', 2002)]
[('22', 245)]
2 闭包
2.1 概念
在函数嵌套的前提下,内部函数使用了外部函数的变量,并且外部函数返回了内部函数,我们把这个使用外部函数变量的内部函数称为闭包。
(1)函数嵌套
(2)内部函数使用外部函数变量
(3)外部函数返回内部函数
(4)内部函数称为闭包
2.2 背景
2.3 简单闭包
# Simple closure
def outer(logo):
    """Return a function that prints a message wrapped in ``<logo>`` tags.

    The returned function remembers the ``logo`` given to this call.
    """
    def inner(msg):
        # ``logo`` comes from the enclosing call, which makes ``inner`` a closure
        print(f"<{logo}>{msg}<{logo}>")
    return inner


fn1 = outer("黑马程序员")  # fn1 is an ``inner`` whose logo is fixed
fn1("大家好呀")
fn1("学python就来")
fn2 = outer("传智教育")
fn2("IT职业教育培训")
fn2("学python就来")
<黑马程序员>大家好呀<黑马程序员>
<黑马程序员>学python就来<黑马程序员>
<传智教育>IT职业教育培训<传智教育>
<传智教育>学python就来<传智教育>
2.4 闭包里的重要关键字——nonlocal
使用nonlocal关键字可以在内部函数中修改外部函数的变量
# Use the nonlocal keyword to modify a variable of the enclosing function
def outer(num1):
    """Return a function that adds its argument to a running total and prints it.

    The running total starts at ``num1`` and is kept between calls.
    """
    def inner(num2):
        # Without ``nonlocal`` the assignment below would not be allowed to
        # rebind ``num1``; this is preferable to making it a global variable.
        nonlocal num1
        num1 += num2
        print(num1)
    return inner


fn = outer(10)
fn(10)
fn(10)
fn(10)
fn(10)
20
30
40
50
2.5 闭包实现——ATM小案例
# Closure example: a tiny ATM
def account_create(initial_amount=0):
    """Create an account and return the function that operates on it.

    The balance starts at ``initial_amount`` and lives in the closure, so it
    can only be changed through the returned function.
    """
    def ATM(num, deposit=True):
        """Deposit ``num`` (default) or withdraw it when ``deposit`` is False."""
        nonlocal initial_amount
        if deposit:
            initial_amount += num
            print(f"存款+{num},账户余额:{initial_amount}")
        else:
            initial_amount -= num
            print(f"取款-{num},账户余额:{initial_amount}")
    return ATM


fn = account_create()
fn(300)
fn(200)
fn(100, False)
存款+300,账户余额:300
存款+200,账户余额:500
取款-100,账户余额:400
3 设计模式
3.1 概述
定义:设计模式就是一种编程套路。
分类:面向对象、单例模式、工厂模式....等。
3.1.1 单例模式
3.1.1.1 场景
一个类无论获取多少次类对象,都仅仅提供一个具体的实例。
3.1.1.2 例题
from text_10 import str_tool

# Both names are bound to the one object imported from text_10
# (presumably an instance created once in that module -- confirm there),
# so the two prints show the same object.
s1 = s2 = str_tool
for instance in (s1, s2):
    print(instance)
3.1.1.3 优点
(1)节省内存
(2)节省创建对象的开销
3.2 工厂模式
3.2.1 场景
当Worker改名字的时候,下面创建的所有Worker实例(worker1、worker2、worker3)都需要改名字。
class Person:
    """Base class of the example hierarchy."""


class Worker(Person):
    """A worker."""


class Student(Person):
    """A student."""


class Teacher(Person):
    """A teacher."""


# Every instance is created by naming the class directly, so renaming
# Worker means touching each of these lines.
worker1 = Worker()
worker2 = Worker()
worker3 = Worker()
3.2.2 例题
class Person:
    """Base class of the example hierarchy."""


class Worker(Person):
    """A worker."""


class Student(Person):
    """A student."""


class Teacher(Person):
    """A teacher."""


class Factory:
    """Single entry point for creating Person objects (the "label machine")."""

    def get_person(self, p_type):
        """Return a new object for the given tag.

        "w" gives a Worker, "s" a Student and "t" a Teacher. Any other tag
        returns None.
        """
        # One place that maps each tag to the class it creates
        products = {"w": Worker, "s": Student, "t": Teacher}
        product_class = products.get(p_type)
        if product_class is None:
            return None
        return product_class()


factory = Factory()  # switch the label machine on
# Ask the factory for the different products by tag
worker = factory.get_person("w")
stu = factory.get_person("s")
teacher = factory.get_person("t")
3.2.3 优点
1.大批量创建对象的时候有统一的入口(Factory),易于代码的维护
2.符合现实世界的模式,即由工厂来制作产品
4 进程和线程
4.1 定义
- 进程就是一个程序。好比一家公司。进程之间的内存隔离。
- 线程好比一家公司的员工,是进程的实际工作者。线程之间的内存共享。
4.2 多...运行
- 多任务运行:操作系统中可以运行多个进程。
- 多线程运行:一个进程内可以运行多个线程。
4.2.1 多线程编程
4.2.1.1 语法
4.2.1.2 例题(一边唱歌,一边跳舞)
一:如何传入方法名
import threading
import time


def dance():
    """Print the dancing message once per second, forever."""
    while True:
        print("我在跳舞,哔哔哔哔哔")
        time.sleep(1)


def sing():
    """Print the singing message once per second, forever."""
    while True:
        print("我在唱歌,啦啦啦啦啦")
        time.sleep(1)


# Pass the function object itself as ``target`` (no parentheses): the
# thread calls it after start().
dance_thread = threading.Thread(target=dance)
sing_thread = threading.Thread(target=sing)
# Both threads run concurrently, so their output can interleave.
dance_thread.start()
sing_thread.start()
我在跳舞,哔哔哔哔哔
我在唱歌,啦啦啦啦啦
我在唱歌,啦啦啦啦啦我在跳舞,哔哔哔哔哔
我在唱歌,啦啦啦啦啦我在跳舞,哔哔哔哔哔
二:方法需要传参数时,如何构建thread对象
import threading
import time


def dance(msg):
    """Print ``msg`` once per second, forever."""
    while True:
        print(f"{msg}")
        time.sleep(1)


def sing(msg):
    """Print ``msg`` once per second, forever."""
    while True:
        print(f"{msg}")
        time.sleep(1)


# Arguments for the target are passed either positionally as a tuple
# (``args``; note the trailing comma of the 1-tuple) or by name as a dict
# (``kwargs``).
dance_thread = threading.Thread(target=dance, args=("我在跳舞,哔哔哔哔哔", ))
sing_thread = threading.Thread(target=sing, kwargs={"msg": "我在唱歌,啦啦啦啦啦"})
dance_thread.start()
sing_thread.start()
我在跳舞,哔哔哔哔哔
我在唱歌,啦啦啦啦啦
我在唱歌,啦啦啦啦啦
我在跳舞,哔哔哔哔哔
我在跳舞,哔哔哔哔哔我在唱歌,啦啦啦啦啦
4.3 socket(套接字)
定义:负责进程之间的网络数据传输,是数据的搬运工。有socket服务端与socket客户端两类。
socket服务端:
- 等待客户端的连接
- 可以接受发来的消息
- 可以回复消息
socket客户端:
- 向服务端发起连接
- 可以发送消息
- 可以接收回复
4.3.1 socket服务端开发
1.创建服务端对象
import socket

# Create the server-side socket object
socket_server = socket.socket()
2.将服务端对象绑定到指定ip和端口
# host and port are placeholders for the address to listen on
socket_server.bind((host, port))
3.服务端开始监听端口
socket_server.listen(backlog) # backlog is an integer: the number of connections allowed; further ones wait. If omitted, a suitable value is chosen automatically
4.服务端接受客户端连接,获得连接对象
# accept() is a blocking call: with no incoming connection, execution waits
# on this line. It returns a 2-tuple, unpacked here into two variables.
conn, address = socket_server.accept()
print(f"接收到客户端连接,来自:{address}")
5.客户端连接服务端后,接收客户端发送的消息
# An endless loop keeps the conversation with the client going; a special
# marker sent by the client (here: exit) ends the loop.
while True:
    # recv() is a blocking call. Its argument is the buffer size (1024 is
    # usually fine) and it returns bytes, decoded here to str as UTF-8.
    data = conn.recv(1024).decode('UTF-8')
    if data == 'exit':
        break
    print(f"接收到发送来的数据:{data}")
6.通过连接对象调用send方法可以回复消息
while True:
    data = conn.recv(1024).decode('UTF-8')
    if data == 'exit':
        break
    print(f"接收到发送来的数据:{data}")
    # Reply through the connection object; send() takes bytes
    conn.send("你好呀哈哈哈".encode('UTF-8'))
7.conn(客户端当次连接对象)和socket server对象调用close方法
socket客户端服务端构建网址:
https://github.com/nicedayzhu/netAssist/releases
总代码
import socket

# Create the server socket, bind it to localhost:8888 and start listening
socket_server = socket.socket()
try:
    socket_server.bind(("localhost", 8888))
    socket_server.listen(1)
    # accept() blocks until a client connects
    conn, address = socket_server.accept()
    try:
        print(f"接收到客户端连接,来自:{address}")
        # Phase 1: print what the client sends until it sends "exit"
        while True:
            data = conn.recv(1024).decode("UTF-8")
            # recv() returns empty bytes once the peer has closed the
            # connection; without this check the loop would spin forever.
            if not data or data == "exit":
                break
            print(f"接收到发送来的数据:{data}")
        # Phase 2: send the operator's replies to the client
        while True:
            msg = input("请输入你要和客户端回复的消息:")
            # Send first, then check: the "exit" marker has to reach the
            # client, which uses it to leave its own receive loop.
            conn.sendall(msg.encode("UTF-8"))
            if msg == "exit":
                break
    finally:
        # Close the connection even if an error interrupts the dialogue
        conn.close()
finally:
    socket_server.close()
4.3.2 socket客户端开发
1.创建客户端对象
import socket

# Create the client-side socket object
socket_client = socket.socket()
2.将客户端对象连接到服务端
# A client connects to the server's address (bind is the server-side call).
# host and port are placeholders for the server address.
socket_client.connect((host, port))
3.客户端发送消息
while True:
    send_msg = input("请输入要发送的信息")
    # Typing exit ends the sending loop
    if send_msg == 'exit':
        break
    # send() takes bytes, so encode the text as UTF-8
    socket_client.send(send_msg.encode("UTF-8"))
4.接收返回消息
while True:
    # recv() returns bytes; decode them to str as UTF-8
    data = socket_client.recv(1024).decode('UTF-8')
    if data == 'exit':
        break
    print(f"接收到发送来的数据:{data}")
5.关闭连接
# Close the client socket once the conversation is over
socket_client.close()
import socket

# Create the client socket and connect to the server on localhost:8888
socket_client = socket.socket()
try:
    socket_client.connect(("localhost", 8888))
    # Phase 1: send what the user types until the user types "exit"
    while True:
        send_msg = input("请输入您要发送的内容:")
        # Send first, then check: the "exit" marker has to reach the server,
        # which uses it to leave its own receive loop. Breaking without
        # sending would leave both sides blocked in recv().
        socket_client.sendall(send_msg.encode("UTF-8"))
        if send_msg == "exit":
            break
    # Phase 2: print the server's replies until it sends "exit"
    while True:
        recv_msg = socket_client.recv(1024).decode("UTF-8")
        # recv() returns empty bytes once the server has closed the
        # connection; without this check the loop would spin forever.
        if not recv_msg or recv_msg == "exit":
            break
        print("服务端回复消息为:", recv_msg)
finally:
    # Close the socket even if an error interrupts the dialogue
    socket_client.close()
5 正则表达式
定义:又称规则表达式(Regular Expression),是使用单个字符串来描述、匹配某个句法规则的字符串,常被用来检索、替换那些符合某个模式(规则)的文本。
5.1 re模块的三个基础方法
5.1.1 match
从头匹配,匹配成功返回匹配对象,匹配不成功返回空。
成功:
import re

s = "python itheima python itheima python itheima"
# match() only matches at the very beginning of the string
result = re.match("python", s)
print(result)
print(result.span())   # a tuple: (start index, end index)
print(result.group())  # the matched substring (taken from the original string)
<re.Match object; span=(0, 6), match='python'>
(0, 6)
python
不成功:
s = "1python itheima python itheima python itheima"
# The string starts with "1", so matching from the start fails
result = re.match("python", s)
print(result)
None
5.1.2 search
搜索整个字符串,找出匹配的。从前向后,找到第一个后,就停止,不会继续向后
成功:
s = "1python666itheima666python666"
# search() scans the whole string and stops at the first match
result = re.search("python", s)
print(result)
print(result.span())   # a tuple: (start index, end index)
print(result.group())  # the matched substring (taken from the original string)
<re.Match object; span=(1, 7), match='python'>
(1, 7)
python
不成功:
s = 'itheima666'
# "python" does not occur anywhere in the string, so search() returns None
result = re.search("python", s)
print(result)
None
5.1.3 findall
匹配整个字符串,找出全部匹配项,找不到返回空list:[]
s = "1python666itheima666python666"
# findall() returns every match in the string as a list; a pattern that
# never matches gives an empty list.
for pattern in ("python", "itcast"):
    result = re.findall(pattern, s)
    print(result)
['python', 'python']
[]
5.2 元字符匹配
5.2.1 基础知识
字符串前的r标记表示原始字符串,其中的转义字符无效(反斜杠按普通字符处理)
在正则表达式的数量限定符中(例如{6,10}),千万注意逗号后面不能有多余的空格
5.2.2 应用实例
import re

s = "itheima1 @@python2 !!666 ##itcast3"

# Find every digit: \d matches one digit
r = r'\d'
digits = re.findall(r, s)
print(digits)

# Find every English letter: [a-zA-Z] matches one lower- or upper-case letter
r = '[a-zA-Z]'
letters = re.findall(r, s)
print(letters)
['1', '2', '6', '6', '6', '3']
['i', 't', 'h', 'e', 'i', 'm', 'a', 'p', 'y', 't', 'h', 'o', 'n', 'i', 't', 'c', 'a', 's', 't']
import re

# Case 1: an account name made only of letters and digits, 6 to 10 characters
print("1---------")
s = 'sakj01234567'
# r1 has no anchors, r2 is anchored with ^ and $: compare the two results to
# see what the boundary anchors do.
r1 = '[a-zA-Z0-9]{6,10}'
r2 = '^[a-zA-Z0-9]{6,10}$'
for pattern in (r1, r2):
    print(re.findall(pattern, s))

# Case 2: a QQ number: digits only, 5 to 11 characters, first digit not 0
print("2---------")
s1 = '21321213'
s2 = '021321213'
r = r'^[1-9][0-9]{4,10}$'
for qq in (s1, s2):
    print(re.findall(r, qq))

# Case 3: an e-mail address, restricted to the qq, 163 and gmail providers.
# Shape: {text}.{text}...{text}@{qq|163|gmail}.{text}.{text}...
print("3---------")
r = r'(^[\w-]+(\.[\w-]+)*@(qq|163|gmail)(\.[\w-]+)+$)'
mail = 'a.b.c.d.e.f.g@qq.com.a.z.c.d.e'
print(re.match(r, mail))
1---------
['sakj012345']
[]
2---------
['21321213']
[]
3---------
<re.Match object; span=(0, 30), match='a.b.c.d.e.f.g@qq.com.a.z.c.d.e'>
6 文件夹递归
def test_os(path=r"C:\Users\18757\Desktop\pythontext\digui"):
    """Demonstrate three basic helpers of the os module on ``path``.

    Prints the entries of the directory, then whether the path is a
    directory, then whether it exists. When ``path`` is not a directory a
    message is printed instead of the listing, so the demo does not crash.

    Args:
        path: directory to inspect; defaults to the sample folder used in
            this tutorial.
    """
    if os.path.isdir(path):
        print(os.listdir(path))   # list the entries under the path
    else:
        print(f"指定路径{path}不存在")
    print(os.path.isdir(path))    # is the path a directory?
    print(os.path.exists(path))   # does the path exist?
# Collect all txt files found under a given directory
import os


def get_txt(path):
    """Return the names of all ``.txt`` files under ``path``, recursively.

    Algorithm: iterate over the entries of the current directory.
      (1) A txt file is added to the result list directly.
      (2) A sub-directory is traversed recursively:
          (2.1) descend: parent directory -> sub-directory;
          (2.2) return: the list built for the sub-directory is appended
                to the list of the current directory.

    Only file names are returned, not full paths, so files with the same
    name in different directories appear more than once.

    Args:
        path: the directory to search.

    Returns:
        A list of file names; an empty list (after printing a message) when
        ``path`` does not exist.
    """
    if not os.path.exists(path):
        print(f"指定路径{path}不存在")
        return []

    text_list = []
    for file in os.listdir(path):
        new_path = os.path.join(path, file)
        if os.path.isdir(new_path):
            # A directory: collect its txt files recursively
            text_list += get_txt(new_path)
        elif file.lower().endswith(".txt"):
            # A file: keep it only if it is a txt file
            text_list.append(file)
    return text_list


result = get_txt("C:/Users/18757/Desktop/pythontext/digui")
print(result)
['1.txt', '2.txt', '3.txt', '1.txt', '2.txt', '2.txt', '3.txt', '2.txt', '3.txt', '2.txt', '3.txt', '2.txt', '3.txt']