2, PyCharm reports "SyntaxError: Non-ASCII character" on Chinese comments
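The usual fix is to declare the source-file encoding on the first or second line of the file (PEP 263). A minimal sketch:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# with the coding declaration above, Python 2 accepts non-ASCII
# (e.g. Chinese) characters in comments and string literals

print(u"中文注释不再报错")  # a Chinese comment/string no longer raises SyntaxError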
4, Navicat on Ubuntu fails to import a CSV file: most likely the field delimiter was left at the default "Tab"; change it to comma (or try the other delimiter options)
7, AttributeError: 'Word2Vec' object has no attribute 'syn0'
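This happens because gensim 1.0 moved the word vectors off the model into a KeyedVectors object: model.syn0 became model.wv.syn0, and later releases renamed it again to model.wv.vectors. A version-tolerant sketch (the model path is illustrative):
import gensim

model = gensim.models.Word2Vec.load("./mymodel")
# gensim < 1.0:  model.syn0
# gensim 1.x:    model.wv.syn0
# gensim 4.x:    model.wv.vectors
vectors = model.wv.syn0 if hasattr(model.wv, "syn0") else model.wv.vectors
print(vectors.shape)  # (vocabulary_size, embedding_dim)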
8, SQLAlchemy introduction (mapping between MySQL types and SQLAlchemy data types)
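For quick reference, common SQLAlchemy column types and the MySQL types they map to. A minimal sketch (the table and columns are hypothetical, for illustration only):
from sqlalchemy import Column, Integer, String, Text, Float, DateTime
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class EssayDemo(Base):  # hypothetical table, for illustration only
    __tablename__ = 'essay_demo'
    id = Column(Integer, primary_key=True)  # -> INT
    text_name = Column(String(64))          # -> VARCHAR(64)
    line_text = Column(Text)                # -> TEXT
    score = Column(Float)                   # -> FLOAT
    created = Column(DateTime)              # -> DATETIME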
9, keras: texts_to_sequences_generator(texts)
from keras.preprocessing.text import Tokenizer
texts=data.x_train
sample_index=0
text_list = texts[sample_index][0]  # a list of sentences; each element is a unicode string
tokenizer = Tokenizer(word_num_per_sent)  # word_num_per_sent: vocabulary-size limit defined elsewhere in the project
tokenizer.fit_on_texts(text_list)
The error:
File "/home/sunxiangguo/PycharmProjects/personality/cnn.py", line 85, in <module> tokenizer.fit_on_texts(text_list)
File "/home/sunxiangguo/anaconda2/lib/python2.7/site-packages/keras/preprocessing/text.py", line 119, in fit_on_texts self.split)
File "/home/sunxiangguo/anaconda2/lib/python2.7/site-packages/keras/preprocessing/text.py", line 38, in text_to_word_sequence text = text.translate(maketrans(filters, split * len(filters)))
TypeError: character mapping must return integer, None or unicode
Fix (in Python 2, Keras's text_to_word_sequence calls string.maketrans, which cannot handle unicode, so encode the sentences to byte strings first):
from keras.preprocessing.text import Tokenizer
texts=data.x_train
sample_index=0
text_list = texts[sample_index][0]  # a list of sentences; each element is a unicode string
tokenizer = Tokenizer(word_num_per_sent)
tokenizer.fit_on_texts([s.encode('ascii') for s in text_list])
#tokenizer.fit_on_texts(text_list)  # the original call that raised the TypeError
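Note that encode('ascii') raises UnicodeEncodeError as soon as a sentence contains a non-ASCII character. If that can happen, encoding to UTF-8 byte strings should be the safer variant (a sketch under the same Python 2 / old-Keras assumption):
tokenizer.fit_on_texts([s.encode('utf-8') for s in text_list])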
10, Write a once-and-for-all data class that combines database reading and train/test splitting
#!/usr/bin/env python
# encoding: utf-8
"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@site: http://blog.csdn.net/github_36326955
@software: PyCharm
@file: get_data
@time: 17-7-11 下午1:55
"""
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
import json
class Data(object):
def __init__(self,big_five='cEXT'):
        # fixed, never changes:
self.engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
self.sample_y5 = self._get_sample_y5()
self.sample_x = self._get_sample_x()
        # user-adjustable:
self.big_five = big_five
self.train_size = 0.9
        # updated automatically when the user changes the settings above:
        self.sample_y = self._get_sample_y()  # changes only with big_five
        self.x_train = None  # changes with big_five and train_size
        self.x_test = None   # changes with big_five and train_size
        self.y_train = None  # changes with big_five and train_size
        self.y_test = None   # changes with big_five and train_size
self.update_train_test()
def details(self):
return {"sample_x": self.sample_x.shape,
"sample_y5": self.sample_y5.shape,
"big_five": self.big_five,
"train_size": self.train_size,
"sample_y": self.sample_y.shape,
"x_train": self.x_train.shape,
"x_test": self.x_test.shape,
"y_train": self.y_train.shape,
"y_test": self.y_test.shape}
def _get_sample_x(self):
        df_all = pd.read_sql_table('table_3', self.engine, columns=['line_text'])  # read the pre-cut essays from table_3
all_text = df_all['line_text']
sample_x = []
for text in all_text:
# get all_line_text in one text
cut_sentence_list = json.loads(text) # type:list (from json to list)
sample_x.append(cut_sentence_list)
return np.array(sample_x).reshape((-1, 1)) # shape (2467,1)
# print ("xx:",self.sample_x.shape)
def _get_sample_y5(self):
return pd.read_sql_table('essays', self.engine,
columns=['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']) # read essays
def _get_sample_y(self):
        return self.sample_y5[self.big_five].values.reshape((-1, 1))  # shape (2467, 1); .values avoids the deprecated Series.reshape
def set_big_five(self,big_five):
"""
:param big_five: 'cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'
:return:
"""
self.big_five=big_five
self.sample_y = self._get_sample_y()
self.update_train_test()
def set_train_size(self,train_size):
self.train_size=train_size
self.update_train_test()
def update_train_test(self):
self.x_train,self.x_test,self.y_train,self.y_test = train_test_split(self.sample_x,self.sample_y,
random_state=1,train_size=self.train_size)
if __name__ == '__main__':
    data = Data()
    print(data.details())
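Usage sketch for the setters (the values are illustrative):
data = Data()
data.set_big_five('cNEU')   # re-derives sample_y and re-splits train/test
data.set_train_size(0.8)    # re-splits with the new ratio
print(data.details())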
11, Write a once-and-for-all sentence-splitting script
#!/usr/bin/python
# -*- coding:utf8 -*-
"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@site: http://blog.csdn.net/github_36326955
@software: PyCharm
@file: RCNN
@time: 17-7-13 上午11:46
"""
from sqlalchemy import create_engine  # MySQL ORM interface, nicer than raw MySQLdb
import pandas as pd
import spacy  # an NLP library like NLTK, but more industrial-strength
import json
def cut_sentences(df):
    all_text_name = df["#AUTHID"]  # type pandas.Series: all text names (the "#AUTHID" column in essays)
    all_text = df["TEXT"]  # type pandas.Series: all texts (the "TEXT" column in essays)
    nlp = spacy.load('en_sm')  # load the model once per chunk instead of once per text
    for i in all_text.index:  # iterate over the chunk's own index; it is not reset across chunks
        print("start to deal with text ", i, " ...")
        text = all_text[i]  # type str: one text in all_text
        text_name = all_text_name[i]  # type str: its name in all_text_name
        test_doc = nlp(text.decode())
        cut_sentence = []
        for sent in test_doc.sents:  # each sentence in the text
            cut_sentence.append(sent.text)
"""
type sent is spacy.tokens.span.Span, not a string,
so, we call the member function Span.text to get its unicode form
"""
cut_sentence_json = json.dumps(cut_sentence)
line_number = len(cut_sentence)
input_data_dic = {'text_name': text_name,
'line_number':line_number,
'line_text': cut_sentence_json
}
input_data = pd.DataFrame(input_data_dic,index=[i],columns=['text_name','line_number','line_text'])
input_data.to_sql('table_3', engine, if_exists='append', index=False, chunksize=100)
"""
DataFrame.index will be insert to table by default. We don't want it, so we set the
index = False(True default)
"""
print("text ", i ," finished")
if __name__ == '__main__':
engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
df = pd.read_sql_table('essays', engine,chunksize=5) # read essays
for df_iter in df:
cut_sentences(df_iter)
12, Write a word2vec training module
#!/usr/bin/env python
# encoding: utf-8
"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@site: http://blog.csdn.net/github_36326955
@software: PyCharm
@file: word2vect
@time: 17-7-10 下午5:00
"""
from sqlalchemy import create_engine # mysql orm interface,better than mysqldb
import pandas as pd
import spacy # a NLP model like NLTK,but more industrial.
import json
import gensim
import datetime, time
"""
we use gensim.models.Word2Vec(sentences,size=200,min_count=0,workers=4)
to train our word vect model without GPUs
参数列表:
min_count=0 修剪内部字典书树
size = 200 神经网络NN层单元数
workers = 4 并行粒度
alpha=0.025
start_time=2017-07-10 19:30
end_time=2017-07-11 05:32:13.441757
totally_time= about 10 hours
ubuntu 16.04 LTS 64bit
python2.7
IDE PyCharm
memory: 7.7GB
Intel Core i7-4790 CPU @ 3.60Ghz x 8
"""
start_time = time.strftime("%Y-%m-%d %H:%M:%S")
with open("./log.txt", 'a') as f:
    f.write(start_time)  # strftime already returns a str; no decode needed
class MySentences(object):
    def __init__(self, df_generator):
        self.df_generator = df_generator
        self.nlp = spacy.load('en_sm')  # load the spaCy model once, not once per text

    def __iter__(self):
        all_text = self.df_generator['line_text']
        count = 0
        for text in all_text:
            count = count + 1
            now = datetime.datetime.now()
            print(str(now) + "..." + str(count))  # simple progress log
# get all_line_text in one text
cut_sentence_list = json.loads(text) # type:list (from json to list)
            # step 1: tokenize each sentence and drop unwanted POS tags,
            # for example:
            #   sentence1: "hello , there."    -> ['hello', 'there']
            #   sentence2: "I'm fine, thanks"  -> ["I'm", "fine", "thanks"]
            # so the generator yields ['hello','there'], ["I'm","fine","thanks"], ...
            stop_word_pos = ["PUNCT", "SPACE", "DET", "ADP"]
            """
            "PUNCT": punctuation
            "SPACE": whitespace
            "DET":   determiners (e.g. "the")
            "ADP":   adpositions (prepositions)
            """
            for sentence in cut_sentence_list:
                sent = []
                text_doc = self.nlp(sentence.decode())
                for token in text_doc:
                    if token.pos_ not in stop_word_pos:
                        sent.append(token.text)  # token.text is unicode
                yield sent
engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
df_all = pd.read_sql_table('table_3', engine) # read essays
sentences=MySentences(df_all)
model = gensim.models.Word2Vec(sentences,size=200,min_count=0,workers=4)
"""
参数列表:
min_count=0 修剪内部字典书树
size = 200 神经网络NN层单元数
workers = 4 并行粒度
alpha=0.025
"""
path = "./mymodel"
model.save(path)
# to load the model later:
#   new_model = gensim.models.Word2Vec.load(path)
#   print(new_model.similarity("now", "here"))
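Once training finishes, the saved model can be reloaded and queried. A small sketch (the query words are illustrative):
import gensim

new_model = gensim.models.Word2Vec.load("./mymodel")
print(new_model.similarity("now", "here"))     # cosine similarity between two words
print(new_model.most_similar("good", topn=5))  # 5 nearest neighbours
# in gensim 1.0+, use new_model.wv.similarity(...) / new_model.wv.most_similar(...)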
13, Error when checking model target: expected activation_2 to have shape (None, 10) but got array with shape (3, 1)
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation

X_train = np.array([[1, 2], [6, 5], [8, 2]])
y_train = np.array([2,3,7])
input_dim = X_train.shape[1]
model = Sequential()
model.add(Dense(output_dim=64, input_dim=input_dim))
model.add(Activation("relu"))
model.add(Dense(output_dim=10))
model.add(Activation("softmax"))
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train, y_train, nb_epoch=5, batch_size=32)
Fix:
Use sparse_categorical_crossentropy instead: categorical_crossentropy expects one-hot targets of shape (None, 10), while y_train here holds plain integer class labels.
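Concretely, with the integer labels in y_train above, only the loss needs to change; everything else stays the same:
model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train, y_train, nb_epoch=5, batch_size=32)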
14, How to run your Django project from the terminal
If your Django project was developed inside a virtualenv, be sure to activate that virtualenv before running it from the terminal, as shown below:
sunxiangguo@sunxiangguo-ubuntu:~/personality_web/bin$ source ./activate
(personality_web) sunxiangguo@sunxiangguo-ubuntu:~/personality_web/bin$ cd /home/sunxiangguo/PycharmProjects/personality_web
(personality_web) sunxiangguo@sunxiangguo-ubuntu:~/PycharmProjects/personality_web$ python manage.py runserver
Performing system checks...
System check identified no issues (0 silenced).
July 25, 2017 - 11:43:50
Django version 1.9.13, using settings 'personality_web.settings'
Starting development server at http://127.0.0.1:8000/
Quit the server with CONTROL-C.