医疗知识图谱笔记（二）

最新推荐文章于 2024-01-27 12:51:01 发布

秦小茗同学

最新推荐文章于 2024-01-27 12:51:01 发布

阅读量377

点赞数 1

分类专栏： 2020/07 文章标签：机器学习 python

本文链接：https://blog.csdn.net/weixin_37546542/article/details/107055437

版权

2020/07 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

1.re库

import re
# Tour of four common `re` functions.  Every pattern is written as a raw
# string so backslash escapes such as \d reach the regex engine untouched;
# a plain '\d' literal is an invalid string escape and warns on current Python.

# search: locate the first place where the pattern matches.
first_match = re.search(pattern=r'w{2}', string='www.runoob.com')
print(first_match)
# sub: replace every match of the pattern (here a trailing "# ..." comment).
without_comment = re.sub(pattern=r'#.*$', repl="", string="2004-959-559 # 这是一个国外电话号码")
print(without_comment)
# findall: collect all non-overlapping matches as a list of strings.
numbers = re.findall(pattern=r'\d+', string='runoob 123 google 456')
print(numbers)
# split: cut the string at every match; leading/trailing matches yield ''.
pieces = re.split(pattern=r"\d+", string="12a32bc43jf3")
print(pieces)
# Regular-expression tutorial (w3cschool):
# https://www.w3cschool.cn/zhengzebiaodashi/

E:\miniconda\python.exe E:/BaiduNetdiskDownload/05NLP项目——医疗知识图谱项目/课时附件资料/课时0/课时0/附件/re.py
<re.Match object; span=(0, 2), match='ww'>
2004-959-559 
['123', '456']
['', 'a', 'bc', 'jf', '']

Process finished with exit code 0

re库的四个常用函数： search findall sub split

2.pandas 库

import pandas as pd
import numpy as np
# Build a 6x4 DataFrame of random numbers indexed by six consecutive days
# starting on 2018-03-10.
dates = pd.date_range('20180310', periods=6)
df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=['A', 'B', 'C', 'D'])
print(df)  # prints the 6-row, 4-column table
# Sample output (values are random and change on every run):
#                    A         B         C         D
# 2018-03-10 -0.092889 -0.503172  0.692763 -1.261313
# 2018-03-11 -0.895628 -2.300249 -1.098069  0.468986
# 2018-03-12  0.084732 -1.275078  1.638007 -0.291145
# 2018-03-13 -0.561528  0.431088  0.430414  1.065939
# 2018-03-14  1.485434 -0.341404  0.267613 -1.493366
# 2018-03-15 -1.671474  0.110933  1.688264 -0.910599
print(df['B'])  # select the single column 'B'
# Sample output (captured from a different run, so the numbers do not match
# column B of the table above):
# 2018-03-10   -0.927291
# 2018-03-11   -0.406842
# 2018-03-12   -0.088316
# 2018-03-13   -1.631055
# 2018-03-14   -0.929926
# 2018-03-15   -0.010904
# Freq: D, Name: B, dtype: float64

# Build a DataFrame from a dict of columns.  The scalar entries ('A', 'B',
# 'F') are repeated to match the four-element columns, as the printed table
# shows.  All sample outputs below were captured with the pandas version used
# when this was written; newer releases may format some of them differently.
df_1 = pd.DataFrame({
    'A': 1.,
    'B': pd.Timestamp('20180310'),
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),
    'D': np.array([3] * 4, dtype='int32'),
    'E': pd.Categorical(["test", "train", "test", "train"]),
    'F': 'foo',
})
print(df_1)
# Sample output:
#      A          B    C  D      E    F
# 0  1.0 2018-03-10  1.0  3   test  foo
# 1  1.0 2018-03-10  1.0  3  train  foo
# 2  1.0 2018-03-10  1.0  3   test  foo
# 3  1.0 2018-03-10  1.0  3  train  foo

print(df_1.dtypes)  # dtype of every column
# Sample output:
# A           float64
# B    datetime64[ns]
# C           float32
# D             int32
# E          category
# F            object
# dtype: object

print(df_1.index)  # row labels
# Sample output:
# Int64Index([0, 1, 2, 3], dtype='int64')

print(df_1.columns)  # column labels
# Sample output:
# Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

print(df_1.values)  # the underlying values as a 2-D array
# Sample output:
# [[1.0 Timestamp('2018-03-10 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2018-03-10 00:00:00') 1.0 3 'train' 'foo']
#  [1.0 Timestamp('2018-03-10 00:00:00') 1.0 3 'test' 'foo']
#  [1.0 Timestamp('2018-03-10 00:00:00') 1.0 3 'train' 'foo']]

print(df_1.describe())  # summary statistics
# Sample output:
#          A    C    D
# count  4.0  4.0  4.0
# mean   1.0  1.0  3.0
# std    0.0  0.0  0.0
# min    1.0  1.0  3.0
# 25%    1.0  1.0  3.0
# 50%    1.0  1.0  3.0
# 75%    1.0  1.0  3.0
# max    1.0  1.0  3.0

print(df_1.T)  # transpose: rows become columns and vice versa
# Sample output (pandas wraps the wide table; the trailing "\" marks the wrap):
#                      0                    1                    2  \
# A                    1                    1                    1
# B  2018-03-10 00:00:00  2018-03-10 00:00:00  2018-03-10 00:00:00
# C                    1                    1                    1
# D                    3                    3                    3
# E                 test                train                 test
# F                  foo                  foo                  foo
#
#                      3
# A                    1
# B  2018-03-10 00:00:00
# C                    1
# D                    3
# E                train
# F                  foo

# axis=1 sorts by column label; ascending=False puts them in reverse order.
print(df_1.sort_index(axis=1, ascending=False))
# Sample output:
#      F      E  D    C          B    A
# 0  foo   test  3  1.0 2018-03-10  1.0
# 1  foo  train  3  1.0 2018-03-10  1.0
# 2  foo   test  3  1.0 2018-03-10  1.0
# 3  foo  train  3  1.0 2018-03-10  1.0

print(df_1.sort_values(by='E'))  # sort rows by the values in column 'E'
# Sample output:
#      A          B    C  D      E    F
# 0  1.0 2018-03-10  1.0  3   test  foo
# 2  1.0 2018-03-10  1.0  3   test  foo
# 1  1.0 2018-03-10  1.0  3  train  foo
# 3  1.0 2018-03-10  1.0  3  train  foo

3.json

import json
# Round-trip a Python dict through JSON text, both in memory and via a file.
# The payload is an ordinary dict, which maps directly onto a JSON object.
data = {
    "第一个key": "第一个value",
    "第二个key": "第二个value",
}
print('原生json数据', data)
# dumps: serialise the dict to a JSON string (handy for storing in a file).
# With the default settings non-ASCII characters are written as \uXXXX escapes.
data_str = json.dumps(data)
print('json转成str', data_str)
# loads: parse a JSON string back into a Python object (a dict here).
data_json = json.loads(data_str)
print('从str转成json', data_json)
# dump: serialise straight into an open file.  The encoding is given
# explicitly so the file does not depend on the platform default.
with open('json.txt', 'w', encoding='utf-8') as f:
    json.dump(data_json, f)

# load: read a JSON file straight back into a Python object (a dict, not a
# string).
with open('json.txt', 'r', encoding='utf-8') as f:
    data_json_exchange = json.load(f)
print('从文件中获得json数据', data_json_exchange)

E:\miniconda\python.exe E:/BaiduNetdiskDownload/05NLP项目——医疗知识图谱项目/课时附件资料/课时0/课时0/附件/json.py
原生json数据 {'第一个key': '第一个value', '第二个key': '第二个value'}
json转成str {"\u7b2c\u4e00\u4e2akey": "\u7b2c\u4e00\u4e2avalue", "\u7b2c\u4e8c\u4e2akey": "\u7b2c\u4e8c\u4e2avalue"}
从str转成json {'第一个key': '第一个value', '第二个key': '第二个value'}
从文件中获得json数据 {'第一个key': '第一个value', '第二个key': '第二个value'}

Process finished with exit code 0

4. gensim词向量库

from gensim.models import Word2Vec
from random import choice
# Two tokenised toy sentences used as the sampling pool for a synthetic corpus.
temp = [
    ['用来', '测试', '的', '分词', '之后', '的', '第一', '句', '话'],
    ['我', '随便', '写', '的', '一', '句', '话'],
]
ls_of_words = []  # list of token lists, one per synthetic sentence
for _ in range(1500):
    ls = choice(temp)
    # range(9, 15) runs six times, so every synthetic sentence holds six
    # tokens drawn (with repetition) from the chosen toy sentence.
    ls_of_words.append([choice(ls) for _ in range(9, 15)])
# Train the word-vector model; the corpus is the only argument given here,
# everything else is left at the library defaults.
model = Word2Vec(ls_of_words)
# Query through `model.wv` (the trained keyed vectors): the model-level
# shortcuts `model.similar_by_word` / `model.similarity` are deprecated and no
# longer exist in gensim 4, whereas the `wv` accessors work on 3.x and 4.x.
# Most similar words:
print(model.wv.similar_by_word('用来'))
# Similarity between two words:
print(model.wv.similarity('用来', '测试'))


# Word-vector clustering and visualisation
from random import choice

import matplotlib
import matplotlib.pyplot as mp
from gensim.models import Word2Vec
from mpl_toolkits import mplot3d  # noqa: F401  (importing registers the '3d' projection on older matplotlib)
from sklearn.cluster import DBSCAN

# Three themed word pools; neighbouring pools share one word.
ls_of_ls = [['芝士', '酸奶', '蛋糕', '巧克力', '做', '吃'],
            ['文本', '数据', '挖掘', '分析', '做', '玩'],
            ['佛山', '广州', '南海', '天河', '吃', '玩']]
ls_of_words = []  # list of token lists (stand-in for jieba.lcut output)
for _ in range(2500):
    ls = choice(ls_of_ls)
    # range(9, 15) runs six times, so each synthetic sentence has six tokens.
    ls_of_words.append([choice(ls) for _ in range(9, 15)])

# Train 3-dimensional vectors so they can be plotted directly in 3D.
# NOTE(review): `size=` and `wv.index2word` are the gensim 3.x spellings;
# gensim 4 renamed them to `vector_size=` and `wv.index_to_key` - confirm the
# installed version before running.
model = Word2Vec(ls_of_words, size=3, window=7)

# Density-based clustering of the word vectors.  Vectors are read through
# `model.wv[...]` because indexing the model itself is deprecated and was
# removed in gensim 4.
vocabulary = model.wv.index2word
vectors = [model.wv[word] for word in vocabulary]
labels = DBSCAN(eps=0.24, min_samples=3).fit(vectors).labels_

# Plot every word at its vector position, coloured by cluster label.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']  # font able to render Chinese
matplotlib.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
fig = mp.figure()
# Create the 3D axes through the figure: constructing Axes3D(fig) directly no
# longer attaches the axes to the figure on recent matplotlib releases.
ax = fig.add_subplot(111, projection='3d')
colors = ['red', 'blue', 'green', 'black']
for word, vector, label in zip(vocabulary, vectors, labels):
    # DBSCAN marks noise with -1, which maps to the last colour ('black');
    # the modulo keeps any additional cluster labels inside the palette
    # instead of raising IndexError.
    color = colors[label % len(colors)]
    ax.scatter(vector[0], vector[1], vector[2], c=color, s=500, alpha=0.4)
    ax.text(vector[0], vector[1], vector[2], word, ha='center', va='center')
mp.show()

5.collections

import collections
# Counter: tally how often each element occurs.
print(collections.Counter('abcdeabcdabcaba'))
# deque: double-ended queue that supports appends at both ends.
q = collections.deque(['a', 'b', 'c'])
q.append('x')
q.appendleft('y')
print(q)
# defaultdict: a missing key is filled in with the factory's return value.
dic = collections.defaultdict(lambda: 'N/A')
dic['k1'] = 'abc'
print(dic['k1'])  # 'abc'
print(dic['k2'])  # 'N/A'
# Plain dicts compare equal regardless of insertion order ...
print('Normal Dictionary:')
d = {}
d['age'] = 'v2'
d['job'] = 'v3'
d1 = {}
d1['job'] = 'v3'
d1['age'] = 'v2'
print(d == d1)
# ... whereas two OrderedDicts are equal only if the insertion order matches.
# The class is reached through the module because only `import collections`
# is in scope; a bare `OrderedDict()` raises NameError.
print('OrderedDict:')
d2 = collections.OrderedDict()
d2['age'] = 'v2'
d2['job'] = 'v3'
d3 = collections.OrderedDict()
d3['job'] = 'v3'
d3['age'] = 'v2'
print(d2 == d3)