数据科学导论作业

23.3.8

(数据 + 文件处理)

从给定的5个招聘信息数据文件(见附件data.rar)中提取出所有的电话号码和邮箱地址,并将结果保存到results.txt文件中。

"""
Finished by Lcy
2023.3.6
"""
import re

# Regular expressions applied to one line of a job posting at a time.
# Every pattern has exactly one capture group, so re.findall returns plain
# strings. Raw strings keep the backslashes literal, dots are escaped so they
# match only a real ".", and look-arounds stop a match from being cut out of
# the middle of a longer address or digit run.

# E-mail addresses whose final domain label is ".com" (not ".com.cn").
Pattern_Mail_com = r"([A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.com)(?!\.?[A-Za-z0-9-])"
# E-mail addresses whose final domain label is ".cn".
Pattern_Mail_cn = r"([A-Za-z0-9._%+-]+@[A-Za-z0-9-]+(?:\.[A-Za-z0-9-]+)*\.cn)(?!\.?[A-Za-z0-9-])"
# 11-digit numbers that are not part of a longer digit run.
Pattern_Phone_A = r"(?<!\d)(\d{11})(?!\d)"
# Landline numbers: 3-4 digit area code, a dash, then 7-8 digits.
Pattern_Phone_B = r"(?<!\d)(\d{3,4}-\d{7,8})(?!\d)"

def Use(Pattern, file, out=None):
    """Write every match of ``Pattern`` found in ``file``, one per line.

    Args:
        Pattern: Regular expression handed to ``re.findall``.
        file: Text to search. Despite the name, the script passes a single
            line of a job posting here, not a file object.
        out: Writable text stream that receives the matches. When omitted,
            the module-level ``t`` opened by the script's main section is
            used, so existing two-argument calls keep working.

    Raises:
        NameError: If ``out`` is omitted and no global ``t`` exists.
    """
    if out is None:
        # Legacy path: write through the results file opened by the script.
        out = t
    out.writelines(match + '\n' for match in re.findall(Pattern, file))
        


if __name__ == "__main__":
    # Patterns run one after another over the whole posting, so each section
    # of the output lists .com addresses, then .cn addresses, then the two
    # phone formats.
    patterns = (
        Pattern_Mail_com,
        Pattern_Mail_cn,
        Pattern_Phone_A,
        Pattern_Phone_B,
    )

    for i in range(1, 6):
        path = "岗位" + str(i) + ".txt"

        # Read the posting once; the context manager guarantees the input
        # handle is closed even if reading fails.
        with open(path, 'r', encoding="utf-8") as data:
            lines = data.readlines()

        # Append mode keeps the sections written for earlier postings.
        # `t` has to remain a module-level name because Use() writes through
        # it. The explicit encoding keeps the Chinese header from depending
        # on the platform's default code page.
        with open("results.txt", 'a', encoding="utf-8") as t:
            t.write("这是岗位" + str(i) + "的信息:")
            t.write('\n')

            for pattern in patterns:
                for line in lines:
                    Use(pattern, line)

            # Blank line separating this posting from the next one.
            t.write('\n')
        

23.3.16

爬虫

在这里插入图片描述

# -*- coding: utf-8 -*-
"""
Published By Li
"""
from pybloom_live import ScalableBloomFilter
#布隆过滤器(数据结构) , 比map快
import requests , re

# Bloom filter of URLs already fetched, plus the ordered list of those URLs.
bloom = ScalableBloomFilter(initial_capacity=100, error_rate=0.001)
pageurl = []
# header={"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36"}


if __name__ == "__main__":
    # Local import: resolves the "next page" href against the current URL.
    from urllib.parse import urljoin

    seed = "https://www.ytu.edu.cn/ydxw/xxyw.htm"

    # Maximum number of "next page" links to follow.
    cnt = 10

    # Captures the href of the "next page" link. The capture stops at the
    # first closing quote so it cannot swallow later attributes on the line.
    pattern = "<span class=\"p_next p_fun\"><a href=\"([^\"]*)\">下页</a></span>"

    # Certificate verification is turned off for the requests below, which
    # makes urllib3 warn on every call; silence that once up front.
    requests.packages.urllib3.disable_warnings()

    while cnt:

        if seed in bloom:
            # The pagination led back to a visited page. Neither `seed` nor
            # `cnt` changes on this path, so `continue` would never terminate.
            break

        bloom.add(seed)
        pageurl.append(seed)

        # verify=False skips HTTPS certificate validation.
        r = requests.get(seed, timeout=50, verify=False)
        r.raise_for_status()  # raise on HTTP error status codes
        r.encoding = r.apparent_encoding  # decode with the detected encoding

        newsInfo = re.findall(pattern, r.text)
        if not newsInfo:
            # No "next page" link: this was the last page.
            break

        # The href is relative to the page it was found on, so resolve it
        # against the current URL rather than a fixed base.
        seed = urljoin(seed, newsInfo[0])

        cnt = cnt - 1

for i in pageurl:
    print(i)

23.4.11 数据处理 + 画图 + 数据统计

1.读取书籍信息book_list.csv,对数据进行预处理:如果书名存在重复,则只保留第一条记录,其他同名书的记录删除;
2.统计总出版量前3的出版社, 使用条状图对结果进行显示;
3. 按年统计各出版社在2009-2016年间每年的图书出版数量。

"""
Created on Mon Apr 10 23:58:10 2023
@author: Lcy
"""

import pandas as pd
from pybloom_live import ScalableBloomFilter
from matplotlib import pyplot as plt
import matplotlib as mtb

if __name__ == '__main__':

    # SimHei lets matplotlib render the Chinese labels; with that font the
    # minus sign must be drawn as an ASCII hyphen to stay visible.
    mtb.rcParams['font.sans-serif'] = ["SimHei"]
    mtb.rcParams["axes.unicode_minus"] = False
    data = pd.read_csv('book_list.csv')

    # Preprocessing: keep only the first record of each title -------------
    # The title is read from the second column. drop_duplicates compares
    # titles exactly, whereas a Bloom filter can report false positives and
    # would then delete books whose title is actually unique.
    title_column = data.columns[1]
    data = data.drop_duplicates(subset=[title_column], keep='first')

    # Parse the publisher column ------------------------------------------
    # Each '出版社' cell is split on '/'. Judging by the slices used below,
    # piece 0 carries a 7-character prefix before the publisher name and
    # piece 1 holds the year in characters 1-4.
    # NOTE(review): confirm this layout against book_list.csv.
    parsed = [
        fields
        for fields in data.loc[:, '出版社'].str.split('/')
        if isinstance(fields, list)  # missing cells come back as NaN; skip them
    ]

    # Top three publishers by number of books ------------------------------
    publishers = pd.Series([fields[0][7:] for fields in parsed], dtype=object)
    # value_counts sorts by count in descending order.
    info = publishers.value_counts().head(3)

    # Bar chart --------------------------------------------------------------
    plt.figure(figsize=(15, 5), dpi=80)
    x = info.index
    y = info.values
    plt.bar(x, y, width=0.5, color="#4B0082")
    plt.xlabel("出版社名称")
    plt.ylabel("作品数量")
    plt.title("出版数量前三出版社")
    plt.grid(alpha=0.3, color="#000000")
    plt.show()

    # Books published per publisher per year, 2009-2016 ----------------------
    new_count = {}

    for fields in parsed:
        if len(fields) < 2:
            continue  # no second field, so no year to read
        year = fields[1][1:5]
        if not year.isdigit():
            continue
        year = int(year)
        if 2009 <= year <= 2016:
            new_count.setdefault(fields[0][7:], []).append(year)

    for name, years in new_count.items():
        data_s = pd.Series(years).value_counts()
        print('出版社名称:{}'.format(name))
        print(data_s)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

草莓猫猫软糖

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值