扩展库pandas
简介
- 基于numpy的数据分析模块,提供大量的标准数据模型和高效操作大型数据集所需要的工具
- 主要提供了3种数据结构:1.Series,带标签的一维数组。2.DataFrame,带标签且大小可变的二维表格结构。3.Panel,带标签且大小可变的三维数组(注意:Panel自pandas 0.20.0起被弃用,并已在0.25.0中移除,新版本中可改用带MultiIndex的DataFrame)
生成一维数组
>>> import pandas as pd
>>> import numpy as np
>>> x= pd.Series([1,3,5,np.nan])
>>> x
0 1.0
1 3.0
2 5.0
3 NaN
dtype: float64
生成二维数组
- pd.DataFrame第一个参数是存放在DataFrame里的数据,第二个参数index是行名,第三个参数columns是列名
>>> datas=pd.date_range(start='20210226',end='20211231',freq='D') #间隔为天
>>> datas
DatetimeIndex(['2021-02-26', '2021-02-27', '2021-02-28', '2021-03-01',
'2021-03-02', '2021-03-03', '2021-03-04', '2021-03-05',
'2021-03-06', '2021-03-07',
...
'2021-12-22', '2021-12-23', '2021-12-24', '2021-12-25',
'2021-12-26', '2021-12-27', '2021-12-28', '2021-12-29',
'2021-12-30', '2021-12-31'],
dtype='datetime64[ns]', length=309, freq='D')
>>> datas=pd.date_range(start='20210226',end='20211231',freq='M') #间隔为月(取每月最后一天);pandas 2.2及以后版本中'M'已弃用,应写作freq='ME'
>>> datas
DatetimeIndex(['2021-02-28', '2021-03-31', '2021-04-30', '2021-05-31',
'2021-06-30', '2021-07-31', '2021-08-31', '2021-09-30',
'2021-10-31', '2021-11-30', '2021-12-31'],
dtype='datetime64[ns]', freq='M')
>>> df=pd.DataFrame([[np.random.randint(1,100)for j in range(4)] for i in range(11)],index=datas,columns=list('ABCD'))
>>> df
A B C D
2021-02-28 23 26 76 71
2021-03-31 56 38 91 60
2021-04-30 84 67 75 87
2021-05-31 56 63 19 24
2021-06-30 7 55 18 16
2021-07-31 51 89 86 74
2021-08-31 77 38 22 60
2021-09-30 90 28 27 87
2021-10-31 37 37 14 5
2021-11-30 18 65 93 12
2021-12-31 88 57 60 59
>>>
>>> df=pd.DataFrame({'A':[np.random.randint(1,100)for i in range(4)],'B':pd.date_range(start='20130101',periods=4,freq='D'),'C':pd.Series([1,2,3,4],index=['zhang','li','zhou','wang'],dtype='float32'),'D':np.array([3]*4,dtype='int32'),'E':pd.Categorical(["test","train","test","train"]),'F':'foo'})
>>> df
A B C D E F
zhang 29 2013-01-01 1.0 3 test foo
li 72 2013-01-02 2.0 3 train foo
zhou 80 2013-01-03 3.0 3 test foo
wang 73 2013-01-04 4.0 3 train foo
>>>
二维数组数据查看
>>> df.head() #默认显示前5行
A B C D E F
zhang 29 2013-01-01 1.0 3 test foo
li 72 2013-01-02 2.0 3 train foo
zhou 80 2013-01-03 3.0 3 test foo
wang 73 2013-01-04 4.0 3 train foo
>>> df.head(3) #查看前3行
A B C D E F
zhang 29 2013-01-01 1.0 3 test foo
li 72 2013-01-02 2.0 3 train foo
zhou 80 2013-01-03 3.0 3 test foo
>>> df.tail(2) #查看最后2行
A B C D E F
zhou 80 2013-01-03 3.0 3 test foo
wang 73 2013-01-04 4.0 3 train foo
查看数据的统计信息
>>> df.describe() #返回平均值、标准差、最小值、最大值等信息
A C D
count 4.000000 4.000000 4.0
mean 63.500000 2.500000 3.0
std 23.273733 1.290994 0.0
min 29.000000 1.000000 3.0
25% 61.250000 1.750000 3.0
50% 72.500000 2.500000 3.0
75% 74.750000 3.250000 3.0
max 80.000000 4.000000 3.0
二维数据转置
>>> df.T
zhang ... wang
A 29 ... 73
B 2013-01-01 00:00:00 ... 2013-01-04 00:00:00
C 1 ... 4
D 3 ... 3
E test ... train
F foo ... foo
[6 rows x 4 columns]
排序
>>> df.sort_index(axis=0,ascending=False) #按行索引标签降序排序(axis=1时按列名排序)
A B C D E F
zhou 80 2013-01-03 3.0 3 test foo
zhang 29 2013-01-01 1.0 3 test foo
wang 73 2013-01-04 4.0 3 train foo
li 72 2013-01-02 2.0 3 train foo
>>> df.sort_index(axis=1,ascending=False)
F E D C B A
zhang foo test 3 1.0 2013-01-01 29
li foo train 3 2.0 2013-01-02 72
zhou foo test 3 3.0 2013-01-03 80
wang foo train 3 4.0 2013-01-04 73
>>> df.sort_values(by='A') #对数据进行排序
A B C D E F
zhang 29 2013-01-01 1.0 3 test foo
li 72 2013-01-02 2.0 3 train foo
wang 73 2013-01-04 4.0 3 train foo
zhou 80 2013-01-03 3.0 3 test foo
>>> df.sort_values(by='A',ascending=False) #降序排列
A B C D E F
zhou 80 2013-01-03 3.0 3 test foo
wang 73 2013-01-04 4.0 3 train foo
li 72 2013-01-02 2.0 3 train foo
zhang 29 2013-01-01 1.0 3 test foo
数据选择
>>> import pandas as pd
>>> import numpy as np
>>> df=pd.DataFrame({'A':[np.random.randint(1,100)for i in range(4)],'B':pd.date_range(start='20130101',periods=4,freq='D'),'C':pd.Series([1,2,3,4],index=['zhang','li','zhou','wang'],dtype='float32'),'D':np.array([3]*4,dtype='int32'),'E':pd.Categorical(["test","train","test","train"]),'F':'foo'})
>>> df
A B C D E F
zhang 55 2013-01-01 1.0 3 test foo
li 52 2013-01-02 2.0 3 train foo
zhou 83 2013-01-03 3.0 3 test foo
wang 64 2013-01-04 4.0 3 train foo
>>> df['A'] #选择列
zhang 55
li 52
zhou 83
wang 64
Name: A, dtype: int64
>>> df[0:2] #使用切片选择多行
A B C D E F
zhang 55 2013-01-01 1.0 3 test foo
li 52 2013-01-02 2.0 3 train foo
>>> df.loc[:,['A','C']] #选择多列
A C
zhang 55 1.0
li 52 2.0
zhou 83 3.0
wang 64 4.0
>>> df.loc[['zhang','zhou'],['A','D','E']] #同时指定多行与多列进行选择
A D E
zhang 55 3 test
zhou 83 3 test
>>> df.at['zhang','A']
55
>>> df.at['zhang','D'] #查询指定行列位置的数据值
3
>>> df.iloc[3] #查询下标为3的行(即第4行)数据
A 64
B 2013-01-04 00:00:00
C 4
D 3
E train
F foo
Name: wang, dtype: object
>>> df.iloc[0:3,0:4] #查询前3行、前4列数据
A B C D
zhang 55 2013-01-01 1.0 3
li 52 2013-01-02 2.0 3
zhou 83 2013-01-03 3.0 3
>>> df.iloc[[0,2,3],[0,4]] #查询指定的多行、多列数据
A E
zhang 55 test
zhou 83 test
wang 64 train
>>> df.iloc[0,1] #查询指定行、列位置的数据值
Timestamp('2013-01-01 00:00:00')
>>> df[df.A>50] #按给定条件进行查询
A B C D E F
zhang 55 2013-01-01 1.0 3 test foo
li 52 2013-01-02 2.0 3 train foo
zhou 83 2013-01-03 3.0 3 test foo
wang 64 2013-01-04 4.0 3 train foo
>>>
数据修改与设置
>>> df.iat[0,2]=3 #修改指定行、列位置的数据值
>>> df
A B C D E F
zhang 55 2013-01-01 3.0 3 test foo
li 52 2013-01-02 2.0 3 train foo
zhou 83 2013-01-03 3.0 3 test foo
wang 64 2013-01-04 4.0 3 train foo
>>> df.loc[:,'D']=[np.random.randint(50,60) for i in range(4)] #修改某列的值
>>> df
A B C D E F
zhang 55 2013-01-01 3.0 58 test foo
li 52 2013-01-02 2.0 52 train foo
zhou 83 2013-01-03 3.0 56 test foo
wang 64 2013-01-04 4.0 57 train foo
>>> df['C']=-df['C'] #对指定列数据取反
>>> df
A B C D E F
zhang 55 2013-01-01 -3.0 58 test foo
li 52 2013-01-02 -2.0 52 train foo
zhou 83 2013-01-03 -3.0 56 test foo
wang 64 2013-01-04 -4.0 57 train foo
>>>
缺失值处理
- 缺失值和异常值处理是大数据预处理环节中很重要的一个步骤
>>> df1=df.reindex(index={'zhang','li','zhou','wang'},columns=list(df.columns)+['G'])
>>> df1.iat[0,6]=3 #修改指定位置的元素值,该列其他元素为缺失值NAN
>>> df1
A B C D E F G
wang 64 2013-01-04 -4.0 57 train foo 3.0
zhou 83 2013-01-03 -3.0 56 test foo NaN
zhang 55 2013-01-01 -3.0 58 test foo NaN
li 52 2013-01-02 -2.0 52 train foo NaN
>>> pd.isnull(df1) #测试缺失值,返回值为True/False阵列
A B C D E F G
wang False False False False False False False
zhou False False False False False False True
zhang False False False False False False True
li False False False False False False True
>>> df1.dropna() #返回不包含缺失值的行
A B C D E F G
wang 64 2013-01-04 -4.0 57 train foo 3.0
>>> df1['G'].fillna(5,inplace=True) #使用指定值填充缺失值
>>> df1
A B C D E F G
wang 64 2013-01-04 -4.0 57 train foo 3.0
zhou 83 2013-01-03 -3.0 56 test foo 5.0
zhang 55 2013-01-01 -3.0 58 test foo 5.0
li 52 2013-01-02 -2.0 52 train foo 5.0
>>>
数据操作
>>> df1.mean() #平均值,自动忽略缺失值
A 63.50
C -3.00
D 55.75
G 4.50
dtype: float64
>>> df.mean(1) #横向计算平均值
zhang 36.666667
li 34.000000
zhou 45.333333
wang 39.000000
dtype: float64
>>> df1.shift(1) #数据移位
A B C D E F G
wang NaN NaT NaN NaN NaN NaN NaN
zhou 64.0 2013-01-04 -4.0 57.0 train foo 3.0
zhang 83.0 2013-01-03 -3.0 56.0 test foo 5.0
li 55.0 2013-01-01 -3.0 58.0 test foo 5.0
>>> df1['D'].value_counts() #直方图统计
52 1
58 1
57 1
56 1
Name: D, dtype: int64
>>> df2=pd.DataFrame(np.random.randn(10,4))
>>> df2
0 1 2 3
0 -0.247972 -0.764663 1.239467 -1.528499
1 -0.223106 1.004240 -1.043497 0.202809
2 0.950814 -1.006138 -0.184229 -0.083002
3 1.146779 1.361694 -0.841514 -0.300036
4 -1.719960 0.372381 0.609789 0.604840
5 0.398698 -0.970149 -1.904028 -0.064732
6 -1.206135 -1.365520 0.759790 0.331760
7 -2.112486 -1.878848 -1.827505 1.041786
8 0.190555 0.895478 -0.082711 -1.325795
9 1.194263 -0.642249 1.307098 -0.257796
>>> p1=df2[:3] #数据行拆分
>>> p1
0 1 2 3
0 -0.247972 -0.764663 1.239467 -1.528499
1 -0.223106 1.004240 -1.043497 0.202809
2 0.950814 -1.006138 -0.184229 -0.083002
>>> p2=df2[3:7]
>>> p2
0 1 2 3
3 1.146779 1.361694 -0.841514 -0.300036
4 -1.719960 0.372381 0.609789 0.604840
5 0.398698 -0.970149 -1.904028 -0.064732
6 -1.206135 -1.365520 0.759790 0.331760
>>> p3=df2[7:]
>>> p3
0 1 2 3
7 -2.112486 -1.878848 -1.827505 1.041786
8 0.190555 0.895478 -0.082711 -1.325795
9 1.194263 -0.642249 1.307098 -0.257796
>>> df3=pd.concat([p1,p2,p3]) #数据行合并
>>> df3
0 1 2 3
0 -0.247972 -0.764663 1.239467 -1.528499
1 -0.223106 1.004240 -1.043497 0.202809
2 0.950814 -1.006138 -0.184229 -0.083002
3 1.146779 1.361694 -0.841514 -0.300036
4 -1.719960 0.372381 0.609789 0.604840
5 0.398698 -0.970149 -1.904028 -0.064732
6 -1.206135 -1.365520 0.759790 0.331760
7 -2.112486 -1.878848 -1.827505 1.041786
8 0.190555 0.895478 -0.082711 -1.325795
9 1.194263 -0.642249 1.307098 -0.257796
>>> df2 == df3 #测试两个二维数据是否相等,返回True/False阵列
0 1 2 3
0 True True True True
1 True True True True
2 True True True True
3 True True True True
4 True True True True
5 True True True True
6 True True True True
7 True True True True
8 True True True True
9 True True True True
>>> df4=pd.DataFrame({'A':[np.random.randint(1,5) for i in range(8)],'B':[np.random.randint(10,15) for i in range(8)],'C':[np.random.randint(20,30)for i in range(8)],'D':[np.random.randint(80,100)for i in range(8)]})
>>> df4
A B C D
0 4 10 21 85
1 4 10 27 93
2 3 14 25 95
3 2 10 20 82
4 2 12 22 81
5 4 14 20 90
6 3 12 27 96
7 1 14 25 99
>>> df4.groupby('A').sum() #数据分组计算
B C D
A
1 14 25 99
2 22 42 163
3 26 52 191
4 34 68 268
>>> df4.groupby(['A','B']).mean()
C D
A B
1 14 25.0 99.0
2 10 20.0 82.0
12 22.0 81.0
3 12 27.0 96.0
14 25.0 95.0
4 10 24.0 89.0
14 20.0 90.0
>>>
matplotlib绘图
>>> import pandas as pd
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> df=pd.DataFrame(np.random.randn(1000,2),columns=['B','C']).cumsum()
>>> df['A']=pd.Series(list(range(len(df))))
>>> plt.figure()
<Figure size 640x480 with 0 Axes>
>>> df.plot(x='A')
<AxesSubplot:xlabel='A'>
>>> plt.show()
>>> df=pd.DataFrame(np.random.rand(10,4),columns=['a','b','c','d'])
>>> df.plot(kind='bar')
<AxesSubplot:>
>>> plt.show()
>>> df.plot(kind='barh',stacked=True)
<AxesSubplot:>
>>> plt.show()
绘制正弦曲线
>>> import numpy as np
>>> import pylab as pl
>>> t=np.arange(0.0,2.0*np.pi,0.01) #生成数组,0~2π之间,以0.01为步长
>>> s=np.sin(t) #对数组中的所有元素求正弦值,得到新数组
>>> pl.plot(t,s) #画图,以t为横坐标,s为纵坐标
[<matplotlib.lines.Line2D object at 0x0000001DE0454B70>]
>>> pl.xlabel('x') #设置横坐标轴标签
Text(0.5, 0, 'x')
>>> pl.ylabel('y')
Text(0, 0.5, 'y')
>>> pl.title('sin') #设置图形标题
Text(0.5, 1.0, 'sin')
>>> pl.show()
>>> import numpy as np
>>> import pylab as pl
>>> a=np.arange(0,2.0*np.pi,0.1)
>>> b=np.cos(a)
>>> pl.scatter(a,b)
<matplotlib.collections.PathCollection object at 0x0000001DE094D7B8>
>>> pl.show()
>>>
>>> import matplotlib.pylab as pl
>>> import numpy as np
>>> x=np.random.random(100)
>>> y=np.random.random(100)
>>> pl.scatter(x,y,s=x*500,c=u'r',marker=u'*') #s指大小,c指颜色,marker指符号形状
<matplotlib.collections.PathCollection object at 0x0000001DE3018438>
>>> pl.show()
>>>
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> labels='frogs','hogs','dogs','logs'
>>> sizes=[15,30,45,10]
>>> colors=['yellowgreen','gold','#FF0000','lightcoral']
>>> explode=(0,0.1,0,0.1) #使饼状图中第2片和第4片裂开
>>> fig=plt.figure()
>>> ax=fig.gca()
>>> ax.pie(np.random.random(4),explode=explode,labels=labels,colors=colors,autopct='%1.1f%%',shadow=True,startangle=90,radius=0.25,center=(0,0),frame=True)
>>> ax.pie(np.random.random(4),explode=explode,labels=labels,colors=colors,autopct='%1.1f%%',shadow=True,startangle=90,radius=0.25,center=(1,1),frame=True)
>>> ax.pie(np.random.random(4),explode=explode,labels=labels,colors=colors,autopct='%1.1f%%',shadow=True,startangle=90,radius=0.25,center=(0,1),frame=True)
>>> ax.pie(np.random.random(4),explode=explode,labels=labels,colors=colors,autopct='%1.1f%%',shadow=True,startangle=90,radius=0.25,center=(1,0),frame=True)
>>> ax.set_xticks([0,1]) #设置坐标轴刻度
>>> ax.set_yticks([0,1])
>>> ax.set_xticklabels(["Sunny","Cloudy"]) #设置坐标轴刻度上显示的标签
>>> ax.set_yticklabels(["Dry","Rainy"])
>>> ax.set_xlim((-0.5,1.5)) #设置坐标轴跨度
>>> ax.set_ylim((-0.5,1.5))
>>> ax.set_aspect('equal')
>>> plt.show()
>>> import numpy as np
>>> import pylab as pl
>>> import matplotlib.font_manager as fm
>>> myfont=fm.FontProperties(fname=r'C:\Windows\Fonts\STKAITI.TTF')
>>> t=np.arange(0.0,2.0*np.pi,0.01) #自变量的取值范围
>>> s=np.sin(t) #计算正弦函数值
>>> z=np.cos(t) #计算余弦函数值
>>> pl.plot(t,s,label='正弦')
[<matplotlib.lines.Line2D object at 0x0000001DE3290C88>]
>>> pl.plot(t,z,label='余弦')
[<matplotlib.lines.Line2D object at 0x0000001DE3186588>]
>>> pl.xlabel('x-变量',fontproperties='STKAITI',fontsize=24) #设置x标签
Text(0.5, 0, 'x-变量')
>>> pl.xlabel('y-正弦余弦函数值',fontproperties='STKAITI',fontsize=24)
Text(0.5, 0, 'y-正弦余弦函数值')
>>> pl.title('sin-cos函数图像',fontproperties='STKAITI',fontsize=32) #图形标题
Text(0.5, 1.0, 'sin-cos函数图像')
>>> pl.legend(prop=myfont) #设置图例
<matplotlib.legend.Legend object at 0x0000001DE3290128>
>>> pl.show()
>>> import matplotlib.pyplot as plt
>>> x=np.linspace(0,2*np.pi,500)
>>> y=np.sin(x)
>>> z=np.cos(x*x)
>>> plt.figure(figsize=(8,5)) #创建8×5英寸的图形;下面label中的文本前后加$,将使用内嵌的mathtext(类LaTeX)引擎将其显示为公式
<Figure size 800x500 with 0 Axes>
>>> plt.plot(x,y,label='$sin(x)$',color='red',linewidth=2) #红色,线宽为2磅(linewidth的单位是point,不是像素)
[<matplotlib.lines.Line2D object at 0x0000001DE4473B00>]
>>> plt.plot(x,z,'b--',label='$cos(x^2)$') #蓝色,虚线
[<matplotlib.lines.Line2D object at 0x0000001DE4473588>]
>>> plt.xlabel('Time(s)')
Text(0.5, 0, 'Time(s)')
>>> plt.ylabel('Volt')
Text(0, 0.5, 'Volt')
>>> plt.title('Sin and Cos figure using pyplot')
Text(0.5, 1.0, 'Sin and Cos figure using pyplot')
>>> plt.ylim(-1.2,1.2)
(-1.2, 1.2)
>>> plt.legend() #显示图例
<matplotlib.legend.Legend object at 0x0000001DE31989E8>
>>> plt.show()
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> x=np.linspace(0,2*np.pi,500) #创建自变量数组
>>> y1=np.sin(x) #创建函数值数组
>>> y2=np.cos(x)
>>> y3=np.sin(x*x)
>>> plt.figure(1) #创建图形
<Figure size 640x480 with 0 Axes>
>>> ax1=plt.subplot(2,2,1) #第一行第一列图形
>>> ax2=plt.subplot(2,2,2) #第一行第二列图形
>>> ax3=plt.subplot(2,1,2) #第二行
>>> plt.sca(ax1) #选择ax1
>>> plt.plot(x,y1,color='red') #绘制红色曲线
[<matplotlib.lines.Line2D object at 0x0000001DE3170358>]
>>> plt.ylim(-1.2,1.2) #限制y坐标轴的范围
(-1.2, 1.2)
>>> plt.sca(ax2) #选择ax2
>>> plt.plot(x,y2,'b--') #绘制蓝色曲线
[<matplotlib.lines.Line2D object at 0x0000001DE0ACF860>]
>>> plt.ylim(-1.2,1.2)
(-1.2, 1.2)
>>> plt.sca(ax3) #选择ax3
>>> plt.plot(x,y3,'g--')
[<matplotlib.lines.Line2D object at 0x0000001DE4333550>]
>>> plt.ylim(-1.2,1.2)
(-1.2, 1.2)
>>> plt.show()
>>>
绘制三维参数曲线
>>> import matplotlib as mp1
>>> from mpl_toolkits.mplot3d import Axes3D
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> mp1.rcParams['legend.fontsize']=10 #图例字号
>>> fig=plt.figure()
>>> ax=fig.gca(projection='3d') #三维图形
>>> theta=np.linspace(-4*np.pi,4*np.pi,100)
>>> z=np.linspace(-4,4,100)*0.3 #测试数据
>>> r=z**3+1
>>> x=r*np.sin(theta)
>>> y=r*np.cos(theta)
>>> ax.plot(x,y,z,label='parametric curve')
[<mpl_toolkits.mplot3d.art3d.Line3D object at 0x0000001DE0454F98>]
>>> ax.legend()
<matplotlib.legend.Legend object at 0x0000001DE0454BA8>
>>> plt.show()
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> import mpl_toolkits.mplot3d
>>> x,y=np.mgrid[-2:2:20j,-2:2:20j]
>>> z=50*np.sin(x+y) #测试数据
>>> ax=plt.subplot(111,projection='3d') #三维图形
>>> ax.plot_surface(x,y,z,rstride=2,cstride=1,cmap=plt.cm.Blues_r)
<mpl_toolkits.mplot3d.art3d.Poly3DCollection object at 0x0000001DE0AB3B38>
>>> ax.set_xlabel('X')
Text(0.5, 0, 'X')
>>> ax.set_ylabel('Y')
Text(0.5, 0, 'Y')
>>> ax.set_zlabel('Z')
Text(0.5, 0, 'Z')
>>> plt.show()
>>>
>>> import numpy as np
>>> import matplotlib.pyplot as plt
>>> import mpl_toolkits.mplot3d
>>> rho,theta=np.mgrid[0:1:40j,0:2*np.pi:40j]
>>> z=rho**2
>>> x=rho*np.cos(theta)
>>> y=rho*np.sin(theta)
>>> ax=pl.subplot(111,projection='3d')
>>> ax.plot_surface(x,y,z)
<mpl_toolkits.mplot3d.art3d.Poly3DCollection object at 0x0000001DE0AC8828>
>>> pl.show()
文件读写
>>> df=pd.DataFrame({'A':[np.random.randint(1,100)for i in range(4)],'B':pd.date_range(start='20130101',periods=4,freq='D'),'C':pd.Series([1,2,3,4],index=['zhang','li','zhou','wang'],dtype='float32'),'D':np.array([3]*4,dtype='int32'),'E':pd.Categorical(["test","train","test","train"]),'F':'foo'})
>>> df
A B C D E F
zhang 4 2013-01-01 1.0 3 test foo
li 48 2013-01-02 2.0 3 train foo
zhou 44 2013-01-03 3.0 3 test foo
wang 88 2013-01-04 4.0 3 train foo
>>> df.to_excel('C:\\Users\\***\\Desktop\\test.xlsx',sheet_name='dfg') #将数据保存为Excel文件
创建词云
- 扩展库wordcloud可以用来制作词云
- pillow库提供了图像处理功能,可以结合两者创建词云头像
>>> import random
>>> import string
>>> import wordcloud
>>> def show(s):
#创建wordcloud对象
wc=wordcloud.WordCloud(
r'C:\\windows\\fonts\\simfang.ttf',width=500,height=400,background_color='white',font_step=3,random_state=False,prefer_horizontal=0.9)
#创建并显示词云
t=wc.generate(s)
t.to_image().save('C:\\Users\\***\\Desktop\\t.png')
>>> show('''hello world 董付国 董付国 董付国 abc fgh yhnbgfd 董付国 董付国 董付国董付国 Pyhton great Python Python''')
>>> import string
>>> import random
>>> from PIL import Image
>>> import wordcloud
>>> def create(imgFile,s):
im=Image.open(imgFile)
w,h=im.size
#创建wordcloud对象
wc=wordcloud.WordCloud(r'C:\\windows\\fonts\\simfang.ttf',width=w,height=h,background_color='white',font_step=3,random_state=False,prefer_horizontal=0.9)
#创建并显示词云
t=wc.generate(s)
t=t.to_image()
for w1 in range(w):
for h1 in range(h):
if im.getpixel((w1,h1))[:3] == (255,255,255):
t.putpixel((w1,h1),(255,255,255))
t.save('C:\\Users\\***\\Desktop\\result.png')
>>> chs=string.ascii_letters+string.digits+string.punctuation
>>> s=[''.join((random.choice(chs) for i in range(8))) for j in range(650)]
>>> s=''.join(s)
>>> create('C:\\Users\\***\\Desktop\\1.jpg',s)