Stage 3: Spark

1 Introduction to Spark and PySpark

1.1 PySpark

1.1.1 Installing the PySpark library

pip install pyspark
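# the same install, but from the Tsinghua PyPI mirror (often faster in mainland China):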
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple pyspark

1.1.2 Building the entry object for the PySpark execution environment

# Imports
from pyspark import SparkConf, SparkContext

# Create a SparkConf object
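# local[*] means "run Spark locally, using all available CPU cores";
# setAppName sets the name shown in the Spark UI and logs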
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
"""
The line above is equivalent to:
conf = SparkConf()
conf.setMaster("local[*]")
conf.setAppName("test_spark_app")
"""

# Create a SparkContext object from the SparkConf object
sc = SparkContext(conf=conf)

# Print the PySpark version
print(sc.version)

# Stop the SparkContext (i.e. stop the PySpark program)
sc.stop()

1.1.3 The PySpark programming model

A PySpark program follows three steps: data input (build an RDD), data processing (RDD transformations), and data output (actions that bring results back to Python or write them to files).
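A minimal end-to-end sketch of this model (a tiny word count; the data and app name are only illustrative):

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("model_demo")
sc = SparkContext(conf=conf)

# 1. Data input: build an RDD from a Python container
rdd = sc.parallelize(["hello spark", "hello pyspark"])

# 2. Data processing: chain RDD transformations
counts = rdd.flatMap(lambda line: line.split(" ")) \
            .map(lambda word: (word, 1)) \
            .reduceByKey(lambda a, b: a + b)

# 3. Data output: an action pulls the result back into Python
print(counts.collect())   # e.g. [('hello', 2), ('spark', 1), ('pyspark', 1)]

sc.stop()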

1.1.4 RDD objects

An RDD (Resilient Distributed Dataset) is Spark's basic abstraction for distributed data: data is loaded into RDDs, processed with RDD methods, and read back out of RDDs.

1.1.4.1 Converting Python containers into RDD objects

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
sc = SparkContext(conf=conf)

rdd1 = sc.parallelize([1,2,3,4,5])
rdd2 = sc.parallelize("12345")
rdd3 = sc.parallelize((1,2,3,4,5))
rdd4 = sc.parallelize({1,2,3,4,5})
rdd5 = sc.parallelize({"name":1,"age":2})

print(rdd1.collect())
print(rdd2.collect())
print(rdd3.collect())
print(rdd4.collect())
print(rdd5.collect())

[1, 2, 3, 4, 5]
['1', '2', '3', '4', '5']
[1, 2, 3, 4, 5]
[1, 2, 3, 4, 5]
['name', 'age']

Note from the output above: a string is split into its individual characters, and a dict keeps only its keys.

Reading a file into an RDD object

from pyspark import SparkConf, SparkContext

conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
sc = SparkContext(conf=conf)

rdd = sc.textFile("C:/Users/18757/Desktop/pythontext/bill.txt") # 文件路径
print(rdd.collect())
['周杰轮,2022-01-01,100000,消费,正式', '周杰轮,2022-01-02,300000,消费,正式', '周杰轮,2022-01-03,100000,消费,测试', '林俊节,2022-01-01,300000,消费,正式', '林俊节,2022-01-02,100000,消费,正式', '林俊节,2022-01-03,100000,消费,测试', '林俊节,2022-01-02,100000,消费,正式']

1.1.4.2 RDD operations

1.1.4.2.1 The map operator

from pyspark import SparkConf, SparkContext
# Tell PySpark which Python interpreter to use for worker processes
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"

conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
sc = SparkContext(conf=conf)

rdd = sc.parallelize([1,2,3,4,5])
# Use map to multiply every element by 10

def func(data):
    return data * 10
rdd2 = rdd.map(func).map(lambda x:x+1)

print(rdd2.collect())
[11, 21, 31, 41, 51]

  

1.1.4.2.2 The flatMap operator

from pyspark import SparkConf,SparkContext
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
sc = SparkContext(conf=conf)

rdd = sc.parallelize(["itheima itcast 666", "itheima itheima itcast", "python itheima"])
def func(data):
    return data.split(" ")
# Goal: extract the individual words from the RDD (map is used first, to show that the result stays nested)

rdd2 = rdd.map(func)
print(rdd2.collect())

[['itheima', 'itcast', '666'], ['itheima', 'itheima', 'itcast'], ['python', 'itheima']]
from pyspark import SparkConf,SparkContext
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("test_spark_app")
sc = SparkContext(conf=conf)

rdd = sc.parallelize(["itheima itcast 666", "itheima itheima itcast", "python itheima"])
def func(data):
    return data.split(" ")
# Goal: extract the individual words from the RDD (flatMap flattens the nested lists)

rdd2 = rdd.flatMap(func)
print(rdd2.collect())

['itheima', 'itcast', '666', 'itheima', 'itheima', 'itcast', 'python', 'itheima']
1.1.4.2.3 The reduceByKey operator

from pyspark import SparkConf,SparkContext
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

rdd = sc.parallelize([('a',1), ('a',1), ('b',1), ('b',1), ('b',1)])
result = rdd.reduceByKey(lambda a, b: a + b)
print(result.collect())
[('b', 3), ('a', 2)]
1.1.4.2.4 Exercise 1

Count how many times each word appears in the given document (hello.txt).

from pyspark import SparkConf,SparkContext
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

rdd = sc.textFile(r"C:\Users\18757\Desktop\pythontext\3\hello.txt")
word_rdd = rdd.flatMap(lambda x:x.split(" "))
word_with_one_rdd = word_rdd.map(lambda x:(x,1))
result = word_with_one_rdd.reduceByKey(lambda a, b: a + b)

print(result.collect())
[('itcast', 4), ('python', 6), ('itheima', 7), ('spark', 4), ('pyspark', 3)]
1.1.4.2.5 The filter operator

from pyspark import SparkConf,SparkContext
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

rdd = sc.parallelize([1, 2, 3, 4, 5])
result = rdd.filter(lambda x:x % 2 == 0)
print(result.collect())
[2, 4]

1.1.4.2.6 The distinct operator

from pyspark import SparkConf,SparkContext
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

rdd = sc.parallelize([1, 1, 3, 3, 5, 5, 6, 6, 6])
rdd = rdd.distinct()
print(rdd.collect())
[1, 3, 5, 6]
1.1.4.2.7 The sortBy method

from pyspark import SparkConf,SparkContext
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

rdd = sc.textFile(r"C:\Users\18757\Desktop\pythontext\3\hello.txt")
word_rdd = rdd.flatMap(lambda x:x.split(" "))
word_with_one_rdd = word_rdd.map(lambda x:(x,1))
result = word_with_one_rdd.reduceByKey(lambda a, b: a + b).sortBy(lambda x:x[1],False,1)  # sort by count, descending, using 1 partition
print(result.collect())

1.1.4.2.8 A comprehensive example

from pyspark import SparkConf,SparkContext
import json
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

# 1. Compute each city's total sales and rank the cities by sales
# 1.1 Read the file to get an RDD
file_rdd = sc.textFile(r"C:\Users\18757\Desktop\pythontext\3\orders.txt")
# 1.2 Split each line on "|" to get the individual JSON strings
JSON_rdd = file_rdd.flatMap(lambda x:x.split("|"))
# ['{"id":1,"timestamp":"2019-05-08T01:03.00Z","category":"平板电脑","areaName":"北京","money":"1450"}', '{"id":2,"timestamp":"2019-05-08T01:01.00Z","category":"手机","areaName":"北京","money":"1450"}', '{"id":3,"timestamp":"2019-05-08T01:03.00Z","category":"手机","areaName":"北京","money":"8412"}', '{"id":4,"timestamp":"2019-05-08T05:01.00Z","category":"电脑","areaName":"上海","money":"1513"}', '{"id":5,"timestamp":"2019-05-08T01:03.00Z","category":"家电","areaName":"北京","money":"1550"}', '{"id":6,"timestamp":"2019-05-08T01:01.00Z","category":"电脑","areaName":"杭州","money":"1550"}', '{"id":7,"timestamp":"2019-05-08T01:03.00Z","category":"电脑","areaName":"北京","money":"5611"}', '{"id":8,"timestamp":"2019-05-08T03:01.00Z","category":"家电","areaName":"北京","money":"4410"}', '{"id":9,"timestamp":"2019-05-08T01:03.00Z","category":"家具","areaName":"郑州","money":"1120"}', '{"id":10,"timestamp":"2019-05-08T01:01.00Z","category":"家具","areaName":"北京","money":"6661"}', '{"id":11,"timestamp":"2019-05-08T05:03.00Z","category":"家具","areaName":"杭州","money":"1230"}', '{"id":12,"timestamp":"2019-05-08T01:01.00Z","category":"书籍","areaName":"北京","money":"5550"}', '{"id":13,"timestamp":"2019-05-08T01:03.00Z","category":"书籍","areaName":"北京","money":"5550"}', '{"id":14,"timestamp":"2019-05-08T01:01.00Z","category":"电脑","areaName":"北京","money":"1261"}', '{"id":15,"timestamp":"2019-05-08T03:03.00Z","category":"电脑","areaName":"杭州","money":"6660"}', '{"id":16,"timestamp":"2019-05-08T01:01.00Z","category":"电脑","areaName":"天津","money":"6660"}', '{"id":17,"timestamp":"2019-05-08T01:03.00Z","category":"书籍","areaName":"北京","money":"9000"}', '{"id":18,"timestamp":"2019-05-08T05:01.00Z","category":"书籍","areaName":"北京","money":"1230"}', '{"id":19,"timestamp":"2019-05-08T01:03.00Z","category":"电脑","areaName":"杭州","money":"5551"}', '{"id":20,"timestamp":"2019-05-08T01:01.00Z","category":"电脑","areaName":"北京","money":"2450"}', '{"id":21,"timestamp":"2019-05-08T01:03.00Z","category":"食品","areaName":"北京","money":"5520"}', '{"id":22,"timestamp":"2019-05-08T01:01.00Z","category":"食品","areaName":"北京","money":"6650"}', '{"id":23,"timestamp":"2019-05-08T01:03.00Z","category":"服饰","areaName":"杭州","money":"1240"}', '{"id":24,"timestamp":"2019-05-08T01:01.00Z","category":"食品","areaName":"天津","money":"5600"}', '{"id":25,"timestamp":"2019-05-08T01:03.00Z","category":"食品","areaName":"北京","money":"7801"}', '{"id":26,"timestamp":"2019-05-08T01:01.00Z","category":"服饰","areaName":"北京","money":"9000"}', '{"id":27,"timestamp":"2019-05-08T01:03.00Z","category":"服饰","areaName":"杭州","money":"5600"}', '{"id":28,"timestamp":"2019-05-08T01:01.00Z","category":"食品","areaName":"北京","money":"8000"}', '{"id":29,"timestamp":"2019-05-08T02:03.00Z","category":"服饰","areaName":"杭州","money":"7000"}']

# 1.3 Convert each JSON string into a Python dict
file_dict = JSON_rdd.map(lambda x:json.loads(x))
# [{'id': 1, 'timestamp': '2019-05-08T01:03.00Z', 'category': '平板电脑', 'areaName': '北京', 'money': '1450'}, {'id': 2, 'timestamp': '2019-05-08T01:01.00Z', 'category': '手机', 'areaName': '北京', 'money': '1450'}, {'id': 3, 'timestamp': '2019-05-08T01:03.00Z', 'category': '手机', 'areaName': '北京', 'money': '8412'}, {'id': 4, 'timestamp': '2019-05-08T05:01.00Z', 'category': '电脑', 'areaName': '上海', 'money': '1513'}, {'id': 5, 'timestamp': '2019-05-08T01:03.00Z', 'category': '家电', 'areaName': '北京', 'money': '1550'}, {'id': 6, 'timestamp': '2019-05-08T01:01.00Z', 'category': '电脑', 'areaName': '杭州', 'money': '1550'}, {'id': 7, 'timestamp': '2019-05-08T01:03.00Z', 'category': '电脑', 'areaName': '北京', 'money': '5611'}, {'id': 8, 'timestamp': '2019-05-08T03:01.00Z', 'category': '家电', 'areaName': '北京', 'money': '4410'}, {'id': 9, 'timestamp': '2019-05-08T01:03.00Z', 'category': '家具', 'areaName': '郑州', 'money': '1120'}, {'id': 10, 'timestamp': '2019-05-08T01:01.00Z', 'category': '家具', 'areaName': '北京', 'money': '6661'}, {'id': 11, 'timestamp': '2019-05-08T05:03.00Z', 'category': '家具', 'areaName': '杭州', 'money': '1230'}, {'id': 12, 'timestamp': '2019-05-08T01:01.00Z', 'category': '书籍', 'areaName': '北京', 'money': '5550'}, {'id': 13, 'timestamp': '2019-05-08T01:03.00Z', 'category': '书籍', 'areaName': '北京', 'money': '5550'}, {'id': 14, 'timestamp': '2019-05-08T01:01.00Z', 'category': '电脑', 'areaName': '北京', 'money': '1261'}, {'id': 15, 'timestamp': '2019-05-08T03:03.00Z', 'category': '电脑', 'areaName': '杭州', 'money': '6660'}, {'id': 16, 'timestamp': '2019-05-08T01:01.00Z', 'category': '电脑', 'areaName': '天津', 'money': '6660'}, {'id': 17, 'timestamp': '2019-05-08T01:03.00Z', 'category': '书籍', 'areaName': '北京', 'money': '9000'}, {'id': 18, 'timestamp': '2019-05-08T05:01.00Z', 'category': '书籍', 'areaName': '北京', 'money': '1230'}, {'id': 19, 'timestamp': '2019-05-08T01:03.00Z', 'category': '电脑', 'areaName': '杭州', 'money': '5551'}, {'id': 20, 'timestamp': '2019-05-08T01:01.00Z', 'category': '电脑', 'areaName': '北京', 'money': '2450'}, {'id': 21, 'timestamp': '2019-05-08T01:03.00Z', 'category': '食品', 'areaName': '北京', 'money': '5520'}, {'id': 22, 'timestamp': '2019-05-08T01:01.00Z', 'category': '食品', 'areaName': '北京', 'money': '6650'}, {'id': 23, 'timestamp': '2019-05-08T01:03.00Z', 'category': '服饰', 'areaName': '杭州', 'money': '1240'}, {'id': 24, 'timestamp': '2019-05-08T01:01.00Z', 'category': '食品', 'areaName': '天津', 'money': '5600'}, {'id': 25, 'timestamp': '2019-05-08T01:03.00Z', 'category': '食品', 'areaName': '北京', 'money': '7801'}, {'id': 26, 'timestamp': '2019-05-08T01:01.00Z', 'category': '服饰', 'areaName': '北京', 'money': '9000'}, {'id': 27, 'timestamp': '2019-05-08T01:03.00Z', 'category': '服饰', 'areaName': '杭州', 'money': '5600'}, {'id': 28, 'timestamp': '2019-05-08T01:01.00Z', 'category': '食品', 'areaName': '北京', 'money': '8000'}, {'id': 29, 'timestamp': '2019-05-08T02:03.00Z', 'category': '服饰', 'areaName': '杭州', 'money': '7000'}]

# 1.4 Extract (city, sales amount) pairs
city_with_money = file_dict.map(lambda x: (x["areaName"],int(x["money"])))
# [('北京', 1450), ('北京', 1450), ('北京', 8412), ('上海', 1513), ('北京', 1550), ('杭州', 1550), ('北京', 5611), ('北京', 4410), ('郑州', 1120), ('北京', 6661), ('杭州', 1230), ('北京', 5550), ('北京', 5550), ('北京', 1261), ('杭州', 6660), ('天津', 6660), ('北京', 9000), ('北京', 1230), ('杭州', 5551), ('北京', 2450), ('北京', 5520), ('北京', 6650), ('杭州', 1240), ('天津', 5600), ('北京', 7801), ('北京', 9000), ('杭州', 5600), ('北京', 8000), ('杭州', 7000)]

# 1.5 Aggregate each city's sales and rank the cities by sales
city_with_money_result = city_with_money.reduceByKey(lambda a, b: a + b)
# [('杭州', 28831), ('天津', 12260), ('北京', 91556), ('上海', 1513), ('郑州', 1120)]
city_with_money_result = city_with_money_result.sortBy(lambda x:x[1],False,1)
print(city_with_money_result.collect())


# 2. Across all cities, which product categories are on sale
city_with_category = file_dict.map(lambda x: (x["areaName"],x["category"]) )
# [('北京', '平板电脑'), ('北京', '手机'), ('北京', '手机'), ('上海', '电脑'), ('北京', '家电'), ('杭州', '电脑'), ('北京', '电脑'), ('北京', '家电'), ('郑州', '家具'), ('北京', '家具'), ('杭州', '家具'), ('北京', '书籍'), ('北京', '书籍'), ('北京', '电脑'), ('杭州', '电脑'), ('天津', '电脑'), ('北京', '书籍'), ('北京', '书籍'), ('杭州', '电脑'), ('北京', '电脑'), ('北京', '食品'), ('北京', '食品'), ('杭州', '服饰'), ('天津', '食品'), ('北京', '食品'), ('北京', '服饰'), ('杭州', '服饰'), ('北京', '食品'), ('杭州', '服饰')]
city_with_category = city_with_category.distinct(1)
def func(a, b):
    result = a + "、"
    result += b
    return result
city_with_category_result = city_with_category.reduceByKey(func)
# [('北京', '平板电脑、手机、家电、电脑、家具、书籍、食品、服饰'), ('上海', '电脑'), ('杭州', '电脑、家具、服饰'), ('郑州', '家具'), ('天津', '电脑、食品')]
print(city_with_category_result.collect())




[('北京', 91556), ('杭州', 28831), ('天津', 12260), ('上海', 1513), ('郑州', 1120)]
[('北京', '平板电脑、手机、家电、电脑、家具、书籍、食品、服饰'), ('上海', '电脑'), ('杭州', '电脑、家具、服饰'), ('郑州', '家具'), ('天津', '电脑、食品')]

1.1.5 Converting RDD objects back into Python data (data output)

1.1.5.1 The four data-output operators

1.1.5.1.1 The reduce operator

reduce(func) aggregates the RDD's elements pairwise with func and returns a single Python value.

1.1.5.1.2 The collect operator

collect() returns all of the RDD's elements as a Python list.

1.1.5.1.3 The take operator

take(n) returns the first n elements of the RDD as a list.

1.1.5.1.4 The count operator

count() returns the number of elements in the RDD.

1.1.5.1.5 Combined example of the four operators
from pyspark import SparkConf,SparkContext
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

rdd = sc.parallelize([1, 1, 3, 3])

print(f"RDD对象的内容为{rdd.collect()}")
print(f"RDD对象中前3个数据为{rdd.take(3)}")
print(f"RDD对象中数据个数为{rdd.count()}")
print(f"RDD对象中数据之和为{rdd.reduce(lambda a, b: a + b)}")
RDD对象的内容为[1, 1, 3, 3]
RDD对象中前3个数据为[1, 1, 3]
RDD对象中数据个数为4
RDD对象中数据之和为8

1.1.6 Writing RDD objects out to files

1.1.6.1 The saveAsTextFile operator

The code below writes with multiple partitions; each partition becomes its own part file, so the data ends up spread across several files inside each output directory.

from pyspark import SparkConf,SparkContext
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
os.environ['HADOOP_HOME'] = r"D:\hadoop-3.0.0"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

rdd1 = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = sc.parallelize([("Hello",3), ("Spark", 5), ("Hi", 7)])
rdd3 = sc.parallelize([[1, 3, 5], [6, 7, 9], [11, 13, 11]])

rdd1.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output1")
rdd2.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output2")
rdd3.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output3")
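
Each output path is created as a directory rather than a single file; a quick way to inspect what was written (the exact number of part files depends on the partition count used):

import os
# saveAsTextFile produces one part-xxxxx file per partition, plus a _SUCCESS marker
print(os.listdir("C:/Users/18757/Desktop/pythontext/output/output1"))
# e.g. ['part-00000', 'part-00001', ..., '_SUCCESS'] (hypothetical listing)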

To write everything into a single partition (one part file), either of the following two approaches works:

1.1.6.1.1 Writing a single partition, method 1: set spark.default.parallelism
from pyspark import SparkConf,SparkContext
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
os.environ['HADOOP_HOME'] = r"D:\hadoop-3.0.0"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
conf.set("spark.default.parallelism", "1")
sc = SparkContext(conf=conf)

rdd1 = sc.parallelize([1, 2, 3, 4, 5])
rdd2 = sc.parallelize([("Hello",3), ("Spark", 5), ("Hi", 7)])
rdd3 = sc.parallelize([[1, 3, 5], [6, 7, 9], [11, 13, 11]])

rdd1.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output1")
rdd2.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output2")
rdd3.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output3")

1.1.6.1.2 Writing a single partition, method 2: pass numSlices to parallelize
from pyspark import SparkConf,SparkContext
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
os.environ['HADOOP_HOME'] = r"D:\hadoop-3.0.0"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
sc = SparkContext(conf=conf)

rdd1 = sc.parallelize([1, 2, 3, 4, 5],1)  # the second argument (numSlices) sets the number of partitions
rdd2 = sc.parallelize([("Hello",3), ("Spark", 5), ("Hi", 7)],1)
rdd3 = sc.parallelize([[1, 3, 5], [6, 7, 9], [11, 13, 11]],1)

rdd1.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output1")
rdd2.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output2")
rdd3.saveAsTextFile("C:/Users/18757/Desktop/pythontext/output/output3")
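
To double-check how many part files a save will produce, the partition count of an RDD can be inspected with getNumPartitions(); a small sketch using the objects above:

# 1 partition -> a single part-00000 file in each output directory
print(rdd1.getNumPartitions())
print(rdd2.getNumPartitions())
print(rdd3.getNumPartitions())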

1.1.7 A comprehensive case study

from pyspark import SparkConf,SparkContext
import json
import os
os.environ['PYSPARK_PYTHON'] = r"D:\dev\python\python3.10.4\python.exe"
conf = SparkConf().setMaster("local[*]").setAppName("text_spark_app")
conf.set("spark.default.parallelism", 1)
sc = SparkContext(conf=conf)

# Build an RDD from the file
rdd = sc.textFile(r"C:\Users\18757\Desktop\pythontext\13\search_log.txt")
# Split each line on the tab character into a list of fields
rdd = rdd.map(lambda data:data.split("\t"))

# For the hourly ranking, take the hour (the first two characters of the timestamp) with weight 1
rdd1 = rdd.map(lambda data:(data[0][:2],1))
# Aggregate per hour, sort descending by count, take the top 3
result1 = rdd1.reduceByKey(lambda a, b: a + b).sortBy(lambda data:data[1],False,1).take(3)
print(result1)

# Top search keywords
rdd2 = rdd.map(lambda data:(data[2],1))
result2 = rdd2.reduceByKey(lambda a, b: a + b).\
    sortBy(lambda data:data[1],False,1).\
    take(3)
print(result2)

# Ranking of time slots by number of searches for '黑马程序员'
result3 = rdd.filter(lambda x:x[2] == '黑马程序员').\
    map(lambda data:(data[0][:2],1)).\
    reduceByKey(lambda a,b:a+b).\
    sortBy(lambda x:x[1],False,1).\
    take(1)
print(result3)

# Convert each record to a JSON string and write the results out to files
rdd.map(lambda x: json.dumps({"time": x[0], "user_id": x[1], "key_word": x[2], "rank1": x[3], "rank2": x[4], "url": x[5]}, ensure_ascii=False)).\
    saveAsTextFile(r"C:\Users\18757\Desktop\pythontext\output\json_search_log")
[('20', 3479), ('23', 3087), ('21', 2989)]
[('scala', 2310), ('hadoop', 2268), ('博学谷', 2002)]
[('22', 245)]

2 Closures

2.1 Concept

With nested functions, when the inner function uses variables of the outer function and the outer function returns the inner function, the inner function that uses the outer function's variables is called a closure.

(1) The functions are nested

(2) The inner function uses the outer function's variables

(3) The outer function returns the inner function

(4) That inner function is called a closure

2.2 Background

2.3 A simple closure

# A simple closure
def outer(logo):
    def inner(msg):
        print(f"<{logo}>{msg}<{logo}>")

    return inner


fn1 = outer("黑马程序员") # outer returns inner; fn1 is an inner whose logo is fixed to this value
fn1("大家好呀")
fn1("学python就来")

fn2 = outer("传智教育")
fn2("IT职业教育培训")
fn2("学python就来")
<黑马程序员>大家好呀<黑马程序员>
<黑马程序员>学python就来<黑马程序员>
<传智教育>IT职业教育培训<传智教育>
<传智教育>学python就来<传智教育>

2.4 nonlocal, an important keyword for closures

The nonlocal keyword lets the inner function modify the outer function's variable.

# Use the nonlocal keyword to modify the outer function's variable
def outer(num1):
    def inner(num2):
        nonlocal num1  # only with nonlocal can inner modify num1; much better than making num1 a global variable
        num1 += num2
        print(num1)

    return inner


fn = outer(10)
fn(10)
fn(10)
fn(10)
fn(10)
20
30
40
50

2.5 Closures in practice: a small ATM example

# Closures in practice: a small ATM example
def account_create(initial_amount=0):
    def ATM(num, deposit = True):
        nonlocal initial_amount
        if deposit:
            initial_amount += num
            print(f"存款+{num},账户余额:{initial_amount}")
        else:
            initial_amount -= num
            print(f"取款-{num},账户余额:{initial_amount}")
    return ATM
fn = account_create()
fn(300)
fn(200)
fn(100,False)
存款+300,账户余额:300
存款+200,账户余额:500
取款-100,账户余额:400

3 Design patterns

3.1 Overview

Definition: a design pattern is a well-established, reusable way of structuring code (a standard programming recipe).

Categories: object-oriented patterns such as the singleton pattern, the factory pattern, and so on.

3.1.1 The singleton pattern

3.1.1.1 Scenario

No matter how many times an object of the class is obtained, only one concrete instance is ever provided.

3.1.2 Example

from text_10 import str_tool

s1 = str_tool
s2 = str_tool
print(s1)
print(s2)
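
The module text_10 itself is not shown above; a minimal sketch of what it might contain — a tool class instantiated exactly once at module level, so every import of str_tool receives the same object (the class and file names here are assumptions):

# text_10.py (hypothetical contents)
class StrTools:
    """String-utility class of which only one instance should ever exist."""
    pass


# the single, module-level instance shared by every importer
str_tool = StrTools()

Because Python caches imported modules, printing s1 and s2 above shows the same object, i.e. only one instance exists.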

3.1.3 Advantages

(1) Saves memory

(2) Saves the cost of creating the object repeatedly

3.2 The factory pattern

3.2.1 Scenario

If the Worker class is renamed, every Worker instance created below (worker1, worker2, worker3) has to be updated as well.

class Person:
    pass

class Worker(Person):
    pass
class Student(Person):
    pass
class Teacher(Person):
    pass

worker1 = Worker()
worker2 = Worker()
worker3 = Worker()

3.2.2 Example
class Person:
    pass

class Worker(Person):
    pass
class Student(Person):
    pass
class Teacher(Person):
    pass

class Factory:  # the "label machine": one entry point that produces Person objects
    def get_person(self, p_type):
        # return a different product depending on the requested label
        if p_type == "w":
            return Worker()
        if p_type == "s":
            return Student()
        if p_type == "t":
            return Teacher()

factory = Factory()  # switch on the "label machine"
# use the label machine's method to hand out the different kinds of products
worker = factory.get_person("w")
stu = factory.get_person("s")
teacher = factory.get_person("t")

3.2.3 Advantages

1. When objects are created in bulk there is a single entry point (Factory), which makes the code easier to maintain.

2. It matches the real world, where products are made by a factory.

4 Processes and threads

4.1 Definitions

  • A process is a running program, comparable to a company. Memory is isolated between processes.
  • A thread is like an employee of that company: threads are the actual workers of a process, and threads within one process share memory.

4.2 Multi-task and multi-thread execution

  • Multi-tasking: the operating system can run multiple processes at the same time.
  • Multi-threading: a single process can run multiple threads.

4.2.1 Multi-threaded programming

4.2.1.1 Syntax
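
A minimal sketch of the standard-library threading API used in the examples below:

import threading

# threading.Thread(group=None, target=None, name=None, args=(), kwargs={}, daemon=None)
#   target : the callable the thread will run
#   args   : positional arguments for target, as a tuple
#   kwargs : keyword arguments for target, as a dict
t = threading.Thread(target=print, args=("hello from a thread",))
t.start()   # start() runs target in a new thread
t.join()    # optionally wait for the thread to finish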

4.2.1.2 Example (singing and dancing at the same time)

Part one: passing in the target function
import threading
import time

def dance():
    while True:
        print("我在跳舞,哔哔哔哔哔")
        time.sleep(1)

def sing():
    while True:
        print("我在唱歌,啦啦啦啦啦")
        time.sleep(1)

dance_thread = threading.Thread(target=dance)
sing_thread = threading.Thread(target=sing)

dance_thread.start()
sing_thread.start()
我在跳舞,哔哔哔哔哔
我在唱歌,啦啦啦啦啦
我在唱歌,啦啦啦啦啦我在跳舞,哔哔哔哔哔

我在唱歌,啦啦啦啦啦我在跳舞,哔哔哔哔哔

Part two: building the Thread object when the target function takes arguments
import threading
import time

def dance(msg):
    while True:
        print(f"{msg}")
        time.sleep(1)

def sing(msg):
    while True:
        print(f"{msg}")
        time.sleep(1)

dance_thread = threading.Thread(target=dance,args=("我在跳舞,哔哔哔哔哔", ))
sing_thread = threading.Thread(target=sing,kwargs={"msg":"我在唱歌,啦啦啦啦啦"})

dance_thread.start()
sing_thread.start()
我在跳舞,哔哔哔哔哔
我在唱歌,啦啦啦啦啦
我在唱歌,啦啦啦啦啦
我在跳舞,哔哔哔哔哔
我在跳舞,哔哔哔哔哔我在唱歌,啦啦啦啦啦

4.3 Sockets

Definition: a socket is responsible for transferring data between processes over the network; it is the "courier" of the data. There are two roles: the socket server and the socket client.

The socket server:

  1. Waits for client connections
  2. Can receive the messages that are sent
  3. Can reply with messages

The socket client:

  1. Initiates the connection to the server
  2. Can send messages
  3. Can receive replies

4.3.1 Developing the socket server

1. Create the server socket object

import socket
socket_server = socket.socket()

2. Bind the server socket to an IP address and port

socket_server.bind((host, port))

3. Start listening on the port

socket_server.listen(backlog)
# backlog is an integer giving the allowed number of queued connections; extra connections wait.
# If omitted, a reasonable default is chosen automatically.

4. Accept a client connection and get a connection object

conn, address = socket_server.accept()
print(f"接收到客户端连接,来自:{address}")
# accept returns a 2-tuple, which can be unpacked into two variables
# accept is a blocking call: if no client connects, execution waits on this line

5. After a client connects, receive the messages it sends

while True:
    data = conn.recv(1024).decode('UTF-8')
    # recv returns bytes; decode('UTF-8') turns them into a str
    # recv's argument is bufsize, the receive buffer size; 1024 is usually enough
    # recv is also a blocking call
    if data == 'exit':
        break
    print(f"接收到发送来的数据:{data}")
    # the while True loop keeps exchanging data with the client
    # a special marker sent by the client, such as 'exit', is used to leave the loop

6. Reply to the client by calling send on the connection object

while True:
    data = conn.recv(1024).decode('UTF-8')
    if data == 'exit':
        break
    print(f"接收到发送来的数据:{data}")
    
    conn.send("你好呀哈哈哈".encode('UTF-8'))

7. Close the connection object conn (for this client session) and the server socket by calling close
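
A small sketch of the corresponding close calls (using the conn and socket_server objects created above):

conn.close()           # close this client's connection
socket_server.close()  # close the listening server socket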

A network debugging assistant (to act as the peer socket client/server during testing) can be downloaded from:

https://github.com/nicedayzhu/netAssist/releases

Complete server code

import socket
socket_server = socket.socket()

socket_server.bind(("localhost", 8888))

socket_server.listen(1)

conn,address = socket_server.accept()
print(f"接收到客户端连接,来自:{address}")

while True:
    data = conn.recv(1024).decode("UTF-8")
    if data == "exit":
        break
    print(f"接收到发送来的数据:{data}")

while True:
    msg = input("请输入你要和客户端回复的消息:")
    if msg == "exit":
        break
    conn.send(msg.encode("UTF-8"))
conn.close()
socket_server.close()

4.3.2 Developing the socket client

1. Create the client socket object

import socket
socket_client = socket.socket()

2. Connect the client socket to the server

socket_client.connect((host, port))

3. Send messages from the client

while True:
    send_msg = input("请输入要发送的信息")
    if send_msg == 'exit':
        break

    socket_client.send(send_msg.encode("UTF-8"))

4. Receive the server's reply

while True:
    data = socket_client.recv(1024).decode('UTF-8')
    if data == 'exit':
        break
    print(f"接收到发送来的数据:{data}")

5. Close the connection

socket_client.close()

Complete client code

import socket
socket_client = socket.socket()

socket_client.connect(("localhost", 8888))

while True:
    send_msg = input("请输入您要发送的内容:")
    if send_msg == "exit":
        break
    socket_client.send(send_msg.encode("UTF_8"))

while True:
    recv_msg = socket_client.recv(1024).decode("UTF_8")
    if recv_msg == "exit":
        break
    print("服务端回复消息为:",recv_msg)

socket_client.close()

5 Regular expressions

Definition: a regular expression (Regular Expression), also called a rule expression, is a single string that describes and matches strings conforming to some syntactic rule; it is commonly used to find and replace text that fits a given pattern.

5.1 Three basic methods of the re module

5.1.1 match

Matches from the start of the string; returns a match object on success and None on failure.

Success:

import re

s = "python itheima python itheima python itheima"
result = re.match("python", s)
print(result)
print(result.span())    # a tuple: (start index, end index)
print(result.group())   # the substring that matched (taken from the original string)
<re.Match object; span=(0, 6), match='python'>
(0, 6)
python

Failure:

s = "1python itheima python itheima python itheima"
result = re.match("python", s)
print(result)
None

5.1.2 search

Searches the whole string from front to back for a match; stops at the first match and does not continue.

Success:

s = "1python666itheima666python666"
result = re.search("python", s)
print(result)
print(result.span())    # a tuple: (start index, end index)
print(result.group())   # the substring that matched (taken from the original string)
<re.Match object; span=(1, 7), match='python'>
(1, 7)
python

Failure:

s = 'itheima666'
result = re.search("python", s)
print(result)
None

5.1.3 findall

Scans the whole string and returns all matches; returns an empty list [] if nothing matches.

s = "1python666itheima666python666"
result = re.findall("python", s)
print(result)

result = re.findall("itcast", s)
print(result)
['python', 'python']
[]

5.2 Metacharacter matching

5.2.1 Basics

Common metacharacters used in the examples below: \d matches a digit, \w matches a word character (letter, digit or underscore), [a-zA-Z] matches an English letter, ^ and $ anchor the start and end of the string, * and + repeat the preceding element, and {m,n} repeats it m to n times.

The r prefix marks a raw string, so backslash escapes are not interpreted by Python.

Inside a regular expression, be careful never to put an extra space after a comma: {6, 10} is not the same as the quantifier {6,10}.
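
A quick illustration of the comma rule (a small sketch):

import re

s = "abc12345"
print(re.findall(r'\d{2,3}', s))   # ['123', '45']  -> {2,3} is a quantifier
print(re.findall(r'\d{2, 3}', s))  # []              -> with a space, "{2, 3}" is matched literally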

5.2.2 Worked examples

import re
s = "itheima1 @@python2 !!666 ##itcast3"
# find all digits:
r = r'\d'
print(re.findall(r, s))
# find all English letters
r = '[a-zA-Z]'
print(re.findall(r, s))
['1', '2', '6', '6', '6', '3']
['i', 't', 'h', 'e', 'i', 'm', 'a', 'p', 'y', 't', 'h', 'o', 'n', 'i', 't', 'c', 'a', 's', 't']

import re

# 1. Match an account name: letters and digits only, length 6 to 10
print("1---------")
s = 'sakj01234567'
# note the effect of the boundary anchors ^ and $
r1 = '[a-zA-Z0-9]{6,10}'
r2 = '^[a-zA-Z0-9]{6,10}$'
print(re.findall(r1, s))
print(re.findall(r2, s))
# 2. Match a QQ number: digits only, length 5 to 11, first digit not 0
print("2---------")
s1 = '21321213'
s2 = '021321213'
r = r'^[1-9][0-9]{4,10}$'
print(re.findall(r, s1))
print(re.findall(r, s2))
# 3. Match an email address, allowing only qq, 163 and gmail domains
# abc.efg.daw@qq.com.cn.eu.qq.aa.cc
# abcgq.com
# {part}.{part}.{part}.{part}.{part}.{part}.{part}.{part}@{part}.{part}.{part}
print("3---------")
r = r'(^[\w-]+(\.[\w-]+)*@(qq|163|gmail)(\.[\w-]+)+$)'
mail = 'a.b.c.d.e.f.g@qq.com.a.z.c.d.e'
print(re.match(r, mail))
1---------
['sakj012345']
[]
2---------
['21321213']
[]
3---------
<re.Match object; span=(0, 30), match='a.b.c.d.e.f.g@qq.com.a.z.c.d.e'>

6 Recursing through folders

import os


def test_os():
    """Demonstrate three basic methods of the os module"""
    print(os.listdir(r"C:\Users\18757\Desktop\pythontext\digui"))      # list the contents of a path
    print(os.path.isdir(r"C:\Users\18757\Desktop\pythontext\digui"))   # is the given path a folder?
    print(os.path.exists(r"C:\Users\18757\Desktop\pythontext\digui"))  # does the given path exist?


# Collect all the txt files inside a given folder
"""
    Idea of the algorithm:
        Iterate over the entries of the current folder.
        (1) If an entry is a txt file, append it to the file list.
        (2) If an entry is a folder (a), recurse into folder a in the same way.
            (2.1) Descend: parent folder -> child folder
            (2.2) Return: the child's list is appended to text_list
"""


def get_txt(path):
    text_list = []
    if os.path.exists(path):
        for file in os.listdir(path):
            new_path = path + '/' + file
            if os.path.isdir(new_path):  # a folder: recurse into it
                text_list += get_txt(new_path)
            elif file.endswith(".txt"):  # a txt file: record it
                text_list.append(file)
        return text_list
    else:
        print(f"指定路径{path}不存在")
        return []


result = get_txt("C:/Users/18757/Desktop/pythontext/digui")
print(result)
['1.txt', '2.txt', '3.txt', '1.txt', '2.txt', '2.txt', '3.txt', '2.txt', '3.txt', '2.txt', '3.txt', '2.txt', '3.txt']
