python使用总结

最新推荐文章于 2021-08-06 08:38:06 发布

风息神怒

最新推荐文章于 2021-08-06 08:38:06 发布

阅读量321

点赞数

分类专栏： python 文章标签： python

本文链接：https://blog.csdn.net/u012260341/article/details/78940919

版权

python 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

本文就项目当中经常用到的python进行总结。
enumerate: 对可迭代对象进行迭代，同时得到每个元素对应的下标
for i, val in enumerate(words):
print i, val,
extend:对列表进行扩充
# list.extend appends every element of b to a in place (it returns None);
# duplicates are kept, so the shared value 3 appears twice.
a = [1, 2, 3]
b = [3, 4, 5]
a.extend(b)  # a is now [1, 2, 3, 3, 4, 5]
词频统计：
import collections
words = ["I", "have", "I"]
a = collections.Counter(words) # Counter({"I":2, "have":1})
字典排序，按照值从大到小排序：
counter_list = sorted(dict1.items(), key=lambda x: x[1], reverse=True)
zip:
同时迭代多个列表：
for i, j in zip(x, y):
print (i, j)
列表构造字典：
dict(zip(list1, list2))
在Python 3中zip创建的结果是迭代器，要得到列表则要在zip外面套一层list()；在Python 2中zip直接返回列表
获取字符串中某个字符出现的下标：
方法一：
str1 = 'aasssa aa aasas'
result = []
for index, letter in enumerate(str1):
if letter == 'a':
result.append(index)
方法一的问题
每次获取到符合条件的结果，都要调用append方法。但实际上我们的关注点根本不在这个方法，它只是我们达成目的的手段，实际上只需要index就好了
返回的result可以继续优化
数据都存在result里面，如果数据量很大的话，会比较占用内存
方法二：
使用生成器generator，yield表达式
调用生成器时，它不会真的执行，而是返回一个迭代器，每次在迭代器上调用内置的next函数时，迭代器会把生成器推进到下一个yield表达式。yield要在函数体内使用
def myGenerator(string, target='a'):
    """Lazily yield every index at which ``target`` occurs in ``string``.

    Args:
        string: The text (or any iterable of items) to scan.
        target: The item to look for; defaults to ``'a'``.

    Yields:
        The zero-based position of each item equal to ``target``, in
        ascending order. Nothing is yielded for an empty input.
    """
    # Iterate over the argument itself rather than a module-level variable,
    # so the generator works for whatever text the caller passes in.
    for index, letter in enumerate(string):
        if letter == target:
            yield index

indexs = myGenerator(str1)
for i in indexs:
print i,

yield的使用：

def myFrangeGenerator(start, stop, step):
    """Yield ``start, start + step, start + 2*step, ...`` while below ``stop``.

    A float-capable counterpart of ``range`` with an exclusive upper bound.

    Args:
        start: First value to yield.
        stop: Exclusive upper bound.
        step: Positive increment between consecutive values.

    Yields:
        Each value ``start + i * step`` that is strictly less than ``stop``.

    Raises:
        ValueError: If ``step`` is not positive (the sequence would never
            reach ``stop``). Raised when iteration starts.
    """
    if step <= 0:
        raise ValueError("step must be positive")
    i = 0
    x = start
    while x < stop:
        yield x
        i += 1
        # Recompute from the start instead of accumulating ``x += step``:
        # repeated float addition drifts (ten additions of 0.1 give
        # 0.9999999999999999, which would wrongly yield an extra value).
        x = start + i * step

for i in myFrangeGenerator(0, 5, 0.5):

print i,

路径的使用：
判断路径下是否是txt后缀的文件:
print "d://123.bin".endswith('.txt') # False
获取路径最后一个部分:
path = "D:/123/test.txt"
print os.path.basename(path) # test.txt
新建一个当前目录下文件名后缀修改的文件
# splitext removes only the final extension, so names with several dots
# (e.g. "my.data.txt" -> "my.data") keep the rest of their stem.
filename = os.path.splitext(os.path.basename(path))[0]
file1 = filename + '.bin' # test.bin
获取文件所在目录：
filedir = os.path.dirname(path) # D:/123
将路径成分进行组合：
os.path.join('tmp', 'data', os.path.basename(path)) # tmp\data\test.txt
扩展用户根目录：
path = '~/123/test.txt'
print os.path.expanduser(path) # C:\Users/123/test.txt
划分目录与文件名（os.path.split；若要划分扩展名请使用os.path.splitext）
print os.path.split(path)
# ('~/123', 'test.txt')
检测文件或目录是否存在：
os.path.exists(path)
os.path.isfile(path)
os.path.isdir(path)
获取路径下的文件大小：
os.path.getsize(path)
获取目录内容的列表:
names = os.listdir('D:/')
print names
获取当前目录下的所有文件：
names = [name for name in os.listdir('D:/') if os.path.isfile(os.path.join('D:/', name))]
print names
获取当前目录下的所有目录：
dirs = [dir for dir in os.listdir('D:/') if os.path.isdir(os.path.join('D:/', dir))]
print dirs
筛选出后缀文件：
txtfiles = [name for name in os.listdir('D:/') if name.endswith('.txt')]
print txtfiles # 只包含文件名
或者glob模块
import glob
txtfiles = glob.glob('D:/*.txt')
print txtfiles # 包含D:/
打印当前目录（'.'）下的所有文件和目录名
files = os.listdir('.')
print files

文本预处理：
切分：
line = 'hello,world'
linelist1 = line.split(',') # 按','进行切分，得到列表
linelist2 = line.split() # 按空格划分，包括连续空格
linelist3 = line.strip() #去除字符串两侧的空格
连接：
line1 = " ".join(linelist1) # 将列表中的元素用' '链接起来,
print line1 # 输出hello world
替代：
print line.replace(' ', '') # 不会改变line本身，打印helloworld
正则：将字符串中多个连续空格替换成一个空格
import re
print re.sub('\s+', ' ', line.strip()) # hello world
字符串拼接：
str1 = 'hello'
str2 = 'world'
print str1+' '+str2 # hello world
print ('{} {}'.format(str1, str2)) # hello world
print 'hello' 'world' # helloworld
字符串的连接问题，使用+操作符特别低效
建议使用join配合生成器表达式
data = ['i', 'have', 1, 'dream', ]
print ' '.join(str(d) for d in data) # i have 1 dream
处理文本中的英文，数字：
str3 = '10'
print str3.isdigit() # True
print str3.isalnum() # True
str4 = 'hi'
print str4.islower() # True
print str4.isupper() # False
print str4.isalpha() # True
获取下标：
word = 'A.B.'
print word.index('.')
求交集并集：
intersection = list(set(a).intersection(set(b)))

union = list(set(a).union(set(b)))

容器操作：
字典拼接
dict1 = {'1': 'hi', '2': ['haha', 'hihi']}
d2 = dict({}, **dict1) # {'1': 'hi', '2': ['haha', 'hihi']}
d3 = {'3': '222'}
d3 = dict(d3, **dict1) # {'1': 'hi', '3': '222', '2': ['haha', 'hihi']}
扁平化输出：
def flatten(ll):
    """Yield the leaves of an arbitrarily nested list, depth-first.

    Args:
        ll: A list (possibly containing further lists) or a single value.

    Yields:
        Every non-list element in left-to-right order. A non-list argument
        is yielded unchanged as the only item; an empty list yields nothing.
    """
    if isinstance(ll, list):
        for i in ll:
            # Recurse so that lists nested at any depth are expanded.
            for element in flatten(i):
                yield element
    else:
        yield ll
testcase= ['and', 'B', ['not', 'A'],[1,2,1,[2,1],[1,1,[2,2,1]]], ['not', 'A', 'A'],['or', 'A', 'B' ,'A'] , 'B']
print list(flatten(testcase)) #['and', 'B', 'not', 'A', 1, 2, 1, 2, 1, 1, 1, 2, 2, 1, 'not', 'A', 'A', 'or', 'A', 'B', 'A', 'B']
字典取值：# get()若存在则返回值，否则返回预设值，比如none
mydict = {"liu":'1', "jiang": '2', "tu": '-10'}
print mydict.get("liu", 'none') # 1
print mydict.get("zhang", 'none') # none
列表拼接：
print [1, 2, 3]+[4, 5, 6] # [1,2,3,4,5,6]
列表加值：
la = [1, 2, 3]
val = 5
val2 = 2
la.append(val or val2) # appends val2 when val is falsy (e.g. 0), otherwise appends val
列表求列和：
l1 = [[1,2,3], [1,2,3]]
col = map(sum, zip(*l1))
依次输出两个列表的元素：
from itertools import chain
a = [1, 2, 3, 4]
b = ['x', 'y']
依次输出两个容器中的元素，无需两个循环语句，更简洁且高效
for item in chain(a, b):
print item
列表筛选：
list2 = filter(lambda x: x not in list1, list2) #只保留list2中不在list1中的元素
嵌套列表的组合提取：
try:
    from collections.abc import Iterable
except ImportError:  # Python 2 has no collections.abc
    from collections import Iterable

# Collects, as a side effect of flatten(), the flattened content of every
# nested iterable encountered (innermost groups are appended first).
a = []


def flatten(items, ignore_types=(str, bytes)):
    """Flatten nested iterables while recording each sub-group in ``a``.

    Args:
        items: An iterable whose elements may themselves be iterables.
        ignore_types: Iterable types to treat as atomic leaves (strings and
            bytes by default, so they are not split into characters).

    Yields:
        The leaf elements of ``items`` in depth-first, left-to-right order.

    Side effects:
        For every nested iterable that is fully consumed, the list of its
        flattened leaves is appended to the module-level list ``a``.
    """
    for x in items:
        if isinstance(x, Iterable) and not isinstance(x, ignore_types):
            b = []
            # Forward ignore_types so a custom setting also applies to
            # deeper nesting levels.
            for item in flatten(x, ignore_types):
                yield item
                b.append(item)
            a.append(b)
        else:
            yield x
items = [1, [[3, [2, 4]], [5, [9, 10]]]]
c = []
for x in flatten(items):
c.append(x)
a.append(c)

print a #[[2, 4], [3, 2, 4], [9, 10], [5, 9, 10], [3, 2, 4, 5, 9, 10], [1, 3, 2, 4, 5, 9, 10]]

层次聚类：
import scipy.cluster.hierarchy as sch
dist = [[1, 1], [1, 2], [4, 3], [4, 2.99]]
disMat = sch.distance.pdist(dist, metric='cosine') #计算两两样本点间的距离
Z = sch.linkage(disMat, method='average', metric='cosine')
tree = sch.to_tree(Z, rd=True) #层次聚类树
cluster = sch.fcluster(Z, t=0.6, criterion='inconsistent') # t=0.6 is the threshold for the chosen criterion; with criterion='inconsistent' it is compared against inconsistency coefficients, not raw distances (use criterion='distance' for a distance cutoff)
sch.dendrogram(Z) #层次聚类图绘制

计算tf-idf值：
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
corpus = ['I have a dream hello', 'computer hello world', 'hello world']
# 该类会将文本中的词语转换为词频矩阵，矩阵元素a[i][j] 表示j词在i类文本下的词频
vectorizer = CountVectorizer()
transformer = TfidfTransformer() # 该类会统计每个词语的tf-idf权值
# 第一个fit_transform是计算tf-idf，第二个fit_transform是将文本转为词频矩阵
tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
print tfidf.toarray() # 得到语料库中每个句子的tfidf向量
tfidf_dict_words_list = vectorizer.get_feature_names() # 获取词袋模型中的所有词语

tsne可视化工具：
import matplotlib.pyplot as plt
from sklearn import manifold
# 可视化工具tsne的使用
def word_visualize_tsne(w2v_dict, top_n_words=None):
    """Project word vectors to 2-D with t-SNE and show a labelled scatter plot.

    Args:
        w2v_dict: Mapping from word to its embedding vector.
            Assumes all vectors have the same length -- TODO confirm.
        top_n_words: Optional container of words to draw. When ``None``
            (the default) every word in ``w2v_dict`` is drawn.

    Side effects:
        Prints a progress message and opens a matplotlib window via
        ``plt.show()``.
    """
    print("Computing t-SNE embedding")
    tsne = manifold.TSNE(n_components=2, init='pca', random_state=0)
    # Materialise keys and values as lists: dict views cannot be indexed on
    # Python 3, and both lists keep the same order, so row i of the
    # embedding belongs to words[i].
    words = list(w2v_dict.keys())
    vectors = list(w2v_dict.values())
    # Fit t-SNE and map the vectors into the 2-D embedding space.
    X_tsne = tsne.fit_transform(vectors)

    for i, word in enumerate(words):
        # Skip words outside the requested subset (no filter when None).
        if top_n_words is not None and word not in top_n_words:
            continue
        # Draw the point.
        plt.scatter(X_tsne[i, 0], X_tsne[i, 1])
        # Label it with the word.
        plt.annotate(word, xy=(X_tsne[i, 0], X_tsne[i, 1]), xytext=(X_tsne[i, 0], X_tsne[i, 1]))
    plt.show()

数值计算：
计算二范数：
from scipy.linalg import norm
print norm([3, 4]) # 5
点乘：
from scipy import dot
print dot([1, 2], [2, 2.5]) # 点乘
取对数：
import scipy
print scipy.log(3)
计算余弦相似度：
import numpy

# Cosine similarity of two vectors, rescaled from [-1, 1] to [0, 1].
A = numpy.asarray([1, 2, -1], dtype=float)
B = numpy.asarray([2, 4, 6], dtype=float)
# Inner product of the two 1-D vectors (equivalent to A * B.T for row
# matrices, without the discouraged numpy.matrix type).
num = float(numpy.dot(A, B))
denom = numpy.linalg.norm(A) * numpy.linalg.norm(B)
cos = num / denom  # cosine of the angle between A and B
sim = 0.5 + 0.5 * cos  # normalise to the range [0, 1]

风息神怒

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python使用总结

本文就项目当中经常用到的python进行总结。enumerate: 对可迭代对象进行迭代，同时将对下标进行迭代for i, val in enumerate(words): print i, val,extend:对列表进行扩充a = [1,2,3] b = [3,4,5] a.extend(b) # [1,2,3,4,5,6]词频统计：i
复制链接

扫一扫