文章目录
23.3.8
(数据 + 文件处理)
从给定的5个招聘信息数据文件(见附件data.rar)中提取出所有的电话号码和邮箱地址,并将结果保存到results.txt文件中
"""
Finished by Lcy
2023.3.6
"""
import re
# Extraction regexes. Raw strings, and the dot before the TLD is escaped:
# the original "(.*.com)" treated '.' as "any character" and the greedy
# ".*" swallowed everything on the line before the address, so strings
# like "xxxcom" (no dot) and whole line prefixes were captured too.
Pattern_Mail_com = r"(\S+\.com)"    # address ending in .com
Pattern_Mail_cn = r"(\S+\.cn)"      # address ending in .cn
Pattern_Phone_A = r"(\d{11})"       # 11-digit mobile number
Pattern_Phone_B = r"(\d{3}-\d{8})"  # 3-digit area code + 8-digit number
def Use(Pattern, file, out=None):
    """Write every match of *Pattern* found in *file* (a text line) to *out*.

    Each match is written on its own line.

    Parameters
    ----------
    Pattern : str
        Regular expression with one capture group.
    file : str
        The text to search (one line of the input file).
    out : file-like, optional
        Destination stream. Defaults to the module-level handle ``t``
        (kept for backward compatibility with the original script, which
        wrote through that global).
    """
    if out is None:
        out = t  # fall back to the global results handle opened in __main__
    for match in re.findall(Pattern, file):
        out.write(match)
        out.write('\n')
if __name__ == "__main__":
    # Patterns applied in this fixed order to keep the original results.txt
    # layout: .com mails, .cn mails, 11-digit phones, dashed phones.
    patterns = [Pattern_Mail_com, Pattern_Mail_cn, Pattern_Phone_A, Pattern_Phone_B]
    for i in range(1, 6):
        path = "岗位" + str(i) + ".txt"
        # Read the job file once, instead of readlines() + seek(0) four
        # times; the original also never closed this handle (leak).
        with open(path, 'r', encoding="utf-8") as data:
            lines = data.readlines()
        # Append mode so the results of all five files accumulate in one
        # report; explicit utf-8 avoids locale-dependent encode errors on
        # the Chinese header line.
        with open("results.txt", 'a', encoding="utf-8") as t:
            t.write("这是岗位" + str(i) + "的信息:")
            t.write('\n')
            for pattern in patterns:
                for line in lines:
                    Use(pattern, line)  # Use() writes matches via global t
            t.write('\n')
23.3.16
爬虫
# -*- coding: utf-8 -*-
"""
Published By Li
"""
from pybloom_live import ScalableBloomFilter
# Bloom filter (probabilistic set data structure) — faster membership tests than a map
import requests , re
# Remembers URLs already visited during the crawl (may give rare false positives)
bloom = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)
# Collected page URLs, printed at the end of the crawl
pageurl = []
# header={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"}
if __name__ == "__main__":
    seed = "https://www.ytu.edu.cn/ydxw/xxyw.htm"
    cnt = 10  # crawl at most 10 pages
    # Ignoring certificate verification (verify=False below) produces a
    # warning; disable it once here instead of inside the loop.
    requests.packages.urllib3.disable_warnings()
    # Captures the href of the "next page" (下页) link.
    pattern = "<span class=\"p_next p_fun\"><a href=\"(.*)\">下页</a></span>"
    while cnt:
        if seed in bloom:
            # Bug fix: the original `continue` looped forever here — seed is
            # unchanged and cnt is not decremented. A revisited URL means the
            # pagination has cycled, so stop crawling.
            break
        bloom.add(seed)
        pageurl.append(seed)
        r = requests.get(seed, timeout=50, verify=False)  # skip https cert check
        r.raise_for_status()                # raise on HTTP errors
        r.encoding = r.apparent_encoding    # guess the real page encoding
        newsInfo = re.findall(pattern, r.text)
        if not newsInfo:
            break  # no "next page" link — original raised IndexError here
        seed = "https://www.ytu.edu.cn/ydxw/" + newsInfo[0]
        cnt = cnt - 1
    for i in pageurl:
        print(i)
23.4.11 数据处理 + 画图 + 数据统计
1.读取书籍信息book_list.csv,对数据进行预处理:如果书名存在重复,则只保留第一条记录,其他同名书的记录删除;
2.统计总出版量前3的出版社, 使用条状图对结果进行显示;
3. 按年对各出版社当年的图书出版数量进行统计 2009-2016
"""
Created on Mon Apr 10 23:58:10 2023
@author: Lcy
"""
import pandas as pd
from pybloom_live import ScalableBloomFilter
from matplotlib import pyplot as plt
import matplotlib as mtb
if __name__ == '__main__':
    # Render Chinese labels and minus signs correctly in matplotlib.
    mtb.rcParams['font.sans-serif'] = ["SimHei"]
    mtb.rcParams["axes.unicode_minus"] = False

    data = pd.read_csv('book_list.csv')

    # --- Preprocessing: keep only the first record of each book title ------
    # Bug fix: the original used a ScalableBloomFilter, whose nonzero
    # false-positive rate can wrongly delete distinct titles; an exact set
    # is correct (and the data easily fits in memory).
    seen = set()
    delx = []  # positional row indices of duplicate titles
    for i in range(data.shape[0]):
        book_name = data.iloc[i, 1]  # column 1 holds the title — TODO confirm
        if book_name in seen:
            delx.append(i)
        seen.add(book_name)
    # Index is still the default RangeIndex, so positions == labels;
    # one vectorized drop replaces the original per-row drop loop.
    data = data.drop(delx, axis=0)

    # --- Top-3 publishers by total output, shown as a bar chart ------------
    # The column looks like "<prefix>publisher / date / ..."; parts[0][7:]
    # strips a fixed 7-character prefix — verify against the CSV layout.
    new_data = data.loc[:, '出版社'].str.split('/')
    count = {}
    for parts in new_data:
        name = parts[0][7:]
        count[name] = count.get(name, 0) + 1
    info = pd.Series(count).sort_values(ascending=False)[0:3]

    plt.figure(figsize=(15, 5), dpi=80)
    plt.bar(info.index, info.values, width=0.5, color="#4B0082")
    plt.xlabel("出版社名称")
    plt.ylabel("作品数量")
    plt.title("出版数量前三出版社")
    plt.grid(alpha=0.3, color="#000000")
    plt.show()

    # --- Per-publisher yearly output for 2009-2016 -------------------------
    new_count = {}
    for parts in new_data:
        if len(parts) < 2:
            continue  # robustness: original indexed parts[1] unguarded
        name = parts[0][7:]
        year = parts[1][1:5]  # "<space>YYYY..." — take the 4 digits
        if not year.isdigit():
            continue
        year = int(year)
        if 2009 <= year <= 2016:
            new_count.setdefault(name, []).append(year)
    for name in new_count:
        data_s = pd.Series(new_count[name]).value_counts()
        print('出版社名称:{}'.format(name))
        print(data_s)