A few small issues

1. How to set up a Python file template in PyCharm
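File templates live under File → Settings → Editor → File and Code Templates → Python Script (the exact menu path may vary slightly across PyCharm versions). A minimal sketch that reproduces the header style used in the scripts later in this post; ${NAME}, ${DATE} and ${TIME} are PyCharm's built-in template variables:

#!/usr/bin/env python
# encoding: utf-8

"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@software: PyCharm
@file: ${NAME}
@time: ${DATE} ${TIME}
"""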


2. PyCharm reports SyntaxError: Non-ASCII character on Chinese comments
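The standard fix (PEP 263) is to declare the source-file encoding on the first or second line of the file:

# -*- coding: utf-8 -*-

With this declaration Python 2 accepts Chinese characters in comments and string literals; the scripts below all carry it (or the equivalent # encoding: utf-8).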


3. Fixing the garbled Navicat interface after installing it on Ubuntu


4. Navicat on Ubuntu fails to import a CSV file: most likely the field separator was left at its default ("定位", i.e. Tab in the Chinese UI); change it to comma (or try the other options)


5. A guide to using Gensim Word2Vec
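A minimal sketch of the workflow this post uses later (section 12); the toy sentences here are made up for illustration:

import gensim

# each training sentence is a list of tokens
sentences = [['hello', 'there'], ['how', 'are', 'you']]

model = gensim.models.Word2Vec(sentences, size=200, min_count=0, workers=4)
model.save('./mymodel')

new_model = gensim.models.Word2Vec.load('./mymodel')
print(new_model.similarity('hello', 'there'))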


6. An introduction to spaCy, an industrial-strength NLP toolkit
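A minimal sketch of the two spaCy features the scripts below rely on, sentence segmentation (doc.sents) and part-of-speech tags (token.pos_); 'en_sm' is the model name used throughout this post, so substitute whichever English model is installed in your environment:

import spacy

nlp = spacy.load('en_sm')
doc = nlp(u'Hello there. How are you?')

for sent in doc.sents:              # sentence segmentation
    print(sent.text)                # Span.text gives the unicode form

for token in doc:
    print(token.text, token.pos_)   # token text and its part-of-speech tag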


7. AttributeError: 'Word2Vec' object has no attribute 'syn0'
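This usually shows up after a gensim upgrade: newer gensim moves the raw vectors into a KeyedVectors object hanging off model.wv, so syn0 is no longer an attribute of Word2Vec itself. A sketch of the usual fix:

import gensim

model = gensim.models.Word2Vec.load('./mymodel')

# old gensim:        model.syn0
vectors = model.wv.syn0        # newer gensim (in the latest releases: model.wv.vectors)
word_vec = model.wv['hello']   # per-word lookup also goes through model.wv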


8. An introduction to SQLAlchemy (and how its data types map to MySQL)
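A rough sketch of how common SQLAlchemy column types map to MySQL types; the table and column names here are made up for illustration:

from sqlalchemy import Column, Integer, String, Text, Float, DateTime
from sqlalchemy.ext.declarative import declarative_base

Base = declarative_base()

class EssayDemo(Base):
    __tablename__ = 'essay_demo'
    id = Column(Integer, primary_key=True)   # -> INT
    text_name = Column(String(255))          # -> VARCHAR(255)
    line_text = Column(Text)                 # -> TEXT
    score = Column(Float)                    # -> FLOAT
    created = Column(DateTime)               # -> DATETIME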


9. Keras: texts_to_sequences_generator(texts)

from keras.preprocessing.text import Tokenizer

texts = data.x_train
sample_index = 0
text_list = texts[sample_index][0]   # a list of sentences, each one a unicode string

tokenizer = Tokenizer(word_num_per_sent)
tokenizer.fit_on_texts(text_list)

This raises:

File "/home/sunxiangguo/PycharmProjects/personality/cnn.py", line 85, in <module> tokenizer.fit_on_texts(text_list)
File "/home/sunxiangguo/anaconda2/lib/python2.7/site-packages/keras/preprocessing/text.py", line 119, in fit_on_texts self.split)
File "/home/sunxiangguo/anaconda2/lib/python2.7/site-packages/keras/preprocessing/text.py", line 38, in text_to_word_sequence text = text.translate(maketrans(filters, split * len(filters)))
TypeError: character mapping must return integer, None or unicode

Fix:

from keras.preprocessing.text import Tokenizer

texts = data.x_train
sample_index = 0
text_list = texts[sample_index][0]   # a list of sentences, each one a unicode string

tokenizer = Tokenizer(word_num_per_sent)
tokenizer.fit_on_texts([s.encode('ascii') for s in text_list])
# tokenizer.fit_on_texts(text_list)
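The root cause: under Python 2, Keras's text_to_word_sequence builds its filter table with string.maketrans, which only handles byte strings, so feeding it unicode triggers the TypeError above; encoding the sentences to byte strings first sidesteps it. With the tokenizer fitted, texts_to_sequences_generator (the function this item is named after) then yields one index sequence per input text, e.g.:

for seq in tokenizer.texts_to_sequences_generator([s.encode('ascii') for s in text_list]):
    print(seq)   # the list of word indices for one sentence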

10. A once-and-for-all data class that rolls database reading and train/test splitting into one

#!/usr/bin/env python
# encoding: utf-8


"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@site: http://blog.csdn.net/github_36326955
@software: PyCharm
@file: get_data
@time: 17-7-11 下午1:55
"""

import pandas as pd
import numpy as np
from sqlalchemy import create_engine
from sklearn.model_selection import train_test_split
import json


class Data(object):

    def __init__(self, big_five='cEXT'):
        # never changes:
        self.engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)
        self.sample_y5 = self._get_sample_y5()
        self.sample_x = self._get_sample_x()

        # user-adjustable
        self.big_five = big_five
        self.train_size = 0.9

        # updated automatically when the settings above change
        self.sample_y = self._get_sample_y()  # only changes with big_five
        self.x_train = None   # changes with big_five and train_size
        self.x_test = None    # changes with big_five and train_size
        self.y_train = None   # changes with big_five and train_size
        self.y_test = None    # changes with big_five and train_size

        self.update_train_test()

    def details(self):
        return {"sample_x": self.sample_x.shape,
                "sample_y5": self.sample_y5.shape,
                "big_five": self.big_five,
                "train_size": self.train_size,
                "sample_y": self.sample_y.shape,
                "x_train": self.x_train.shape,
                "x_test": self.x_test.shape,
                "y_train": self.y_train.shape,
                "y_test": self.y_test.shape}

    def _get_sample_x(self):
        df_all = pd.read_sql_table('table_3', self.engine, columns=['line_text'])  # read the split sentences
        all_text = df_all['line_text']
        sample_x = []
        for text in all_text:
            # get all_line_text in one text
            cut_sentence_list = json.loads(text)  # type:list (from json to list)
            sample_x.append(cut_sentence_list)
        return np.array(sample_x).reshape((-1, 1))  # shape (2467, 1)

    def _get_sample_y5(self):
        return pd.read_sql_table('essays', self.engine,
                                   columns=['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'])  # read essays

    def _get_sample_y(self):
        # go through .values: numpy arrays have reshape, pandas Series.reshape is deprecated
        return self.sample_y5[self.big_five].values.reshape((-1, 1))  # shape (2467, 1)


    def set_big_five(self, big_five):
        """
        :param big_five: one of 'cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN'
        :return:
        """
        self.big_five=big_five
        self.sample_y = self._get_sample_y()
        self.update_train_test()

    def set_train_size(self,train_size):
        self.train_size=train_size
        self.update_train_test()

    def update_train_test(self):
        self.x_train,self.x_test,self.y_train,self.y_test = train_test_split(self.sample_x,self.sample_y,
                                                                             random_state=1,train_size=self.train_size)

if __name__ == '__main__':
    data = Data()
    print(data.details())
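Once constructed, the target trait and split ratio can be changed on the fly; the train/test arrays are refreshed automatically:

data = Data()
data.set_big_five('cNEU')    # switch the target Big Five dimension
data.set_train_size(0.8)     # re-split with a different training ratio
print(data.x_train.shape, data.y_train.shape)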

11. A once-and-for-all sentence-splitting module

#!/usr/bin/python
# -*- coding:utf8 -*-

"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@site: http://blog.csdn.net/github_36326955
@software: PyCharm
@file: RCNN
@time: 17-7-13 上午11:46
"""

from sqlalchemy import create_engine    # MySQL ORM interface, better than MySQLdb
import pandas as pd
import spacy    # an NLP toolkit like NLTK, but more industrial-strength
import json

def cut_sentences(df):
    all_text_name = df["#AUTHID"]  # pandas.Series: all text names (the "#AUTHID" column in essays)

    all_text = df["TEXT"]  # pandas.Series: all texts (the "TEXT" column in essays)

    nlp = spacy.load('en_sm')   # load the model once per chunk, not once per text

    all_number = all_text_name.index[-1]    # from 0 to len(all_text_name)-1
    for i in xrange(0, all_number + 1, 1):
        print("start to deal with text ", i, " ...")
        text = all_text[i]  # str: one text in all_text

        text_name = all_text_name[i]    # str: the matching name in all_text_name

        test_doc = nlp(text.decode())

        cut_sentence = []
        for sent in test_doc.sents:     # get each line in the text
            cut_sentence.append(sent.text)
            """
            type sent is spacy.tokens.span.Span, not a string,
            so, we call the member function Span.text to get its unicode form
            """

        cut_sentence_json = json.dumps(cut_sentence)
        line_number = len(cut_sentence)
        input_data_dic = {'text_name': text_name,
                         'line_number':line_number,
                         'line_text': cut_sentence_json
                         }

        input_data = pd.DataFrame(input_data_dic,index=[i],columns=['text_name','line_number','line_text'])

        input_data.to_sql('table_3', engine, if_exists='append', index=False, chunksize=100)
        """
        DataFrame.index will be insert to table by default. We don't want it, so we set the 
        index = False(True default)
        """
        print("text ", i ," finished")

if __name__ == '__main__':
    engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)

    df = pd.read_sql_table('essays', engine,chunksize=5)  # read essays
    for df_iter in df:
        cut_sentences(df_iter)

12. A word2vec training module

#!/usr/bin/env python
# encoding: utf-8


"""
@version: python2.7
@author: Xiangguo Sun
@contact: sunxiangguodut@qq.com
@site: http://blog.csdn.net/github_36326955
@software: PyCharm
@file: word2vect
@time: 17-7-10 下午5:00
"""
from sqlalchemy import create_engine    # MySQL ORM interface, better than MySQLdb
import pandas as pd
import spacy    # an NLP toolkit like NLTK, but more industrial-strength
import json
import gensim
import datetime, time
"""
we use gensim.models.Word2Vec(sentences,size=200,min_count=0,workers=4)
to train our word vect model without GPUs


参数列表:
min_count=0 修剪内部字典书树
size = 200 神经网络NN层单元数
workers = 4 并行粒度
alpha=0.025


start_time=2017-07-10 19:30

end_time=2017-07-11 05:32:13.441757

totally_time= about 10 hours

ubuntu 16.04 LTS 64bit
python2.7
IDE PyCharm
memory: 7.7GB
Intel Core i7-4790 CPU @ 3.60Ghz x 8

"""
start_time = time.strftime("%Y-%m-%d %H:%M:%S")
with open("./log.txt", 'a') as f:
    f.write(start_time + '\n')   # strftime already returns a str, so no decode is needed

class MySentences(object):
    def __init__(self, df_generator):
        self.df_generator = df_generator
        self.nlp = spacy.load('en_sm')   # load the model once, not once per text

    def __iter__(self):

        all_text = self.df_generator['line_text']
        count = 0
        for text in all_text:
            count = count + 1
            now = datetime.datetime.now()
            print(str(now) + "..." + str(count))

            # get all the line_text in one essay
            cut_sentence_list = json.loads(text)    # type: list (from json to list)

            # step 1: tokenize each sentence and drop punctuation,
            # turning each sentence into a token sequence.
            # for example:
            #   sentence1: "hello , there."
            #   sentence2: "I'm fine, thanks"
            # should yield: [['hello', 'there'], ["I'm", "fine", "thanks"]]

            stop_word_pos = ["PUNCT", "SPACE", "DET", "ADP"]
            """
            "PUNCT": punctuation
            "SPACE": whitespace
            "DET":   determiners (e.g. "the")
            "ADP":   adpositions (prepositions)
            """

            for sentence in cut_sentence_list:
                sent = []
                text_doc = self.nlp(sentence.decode())
                for token in text_doc:
                    if token.pos_ not in stop_word_pos:
                        sent.append(token.text)     # token.text is unicode
                yield sent



engine = create_engine('mysql+pymysql://root:root@localhost:3306/personality_1', echo=True)

df_all = pd.read_sql_table('table_3', engine)  # read the split sentences produced in section 11


sentences=MySentences(df_all)
model = gensim.models.Word2Vec(sentences,size=200,min_count=0,workers=4)
"""
参数列表:
min_count=0 修剪内部字典书树
size = 200 神经网络NN层单元数
workers = 4 并行粒度
alpha=0.025
"""

path = "./mymodel"
model.save(path)
# To load the model later:
# new_model = gensim.models.Word2Vec.load(path)
# print(new_model.similarity("now", "here"))

13. Error when checking model target: expected activation_2 to have shape (None, 10) but got array with shape (3, 1)

import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Activation

X_train = np.array([[1, 2], [6, 5], [8, 2]])
y_train = np.array([2, 3, 7])
input_dim = X_train.shape[1]

model = Sequential()

model.add(Dense(output_dim=64, input_dim=input_dim))
model.add(Activation("relu"))
model.add(Dense(output_dim=10))
model.add(Activation("softmax"))
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])

model.fit(X_train, y_train, nb_epoch=5, batch_size=32)

Fix:

Use sparse_categorical_crossentropy: categorical_crossentropy expects one-hot targets of shape (None, 10), while y_train here holds integer class labels, which is exactly what the sparse variant accepts.
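A minimal sketch of the two ways out, either switching the loss or one-hot encoding the labels with to_categorical:

# option 1: keep integer labels, switch the loss
model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train, y_train, nb_epoch=5, batch_size=32)

# option 2: keep categorical_crossentropy, one-hot encode the labels
from keras.utils.np_utils import to_categorical
y_train_onehot = to_categorical(y_train, 10)   # shape (3, 10)
model.compile(loss='categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train, y_train_onehot, nb_epoch=5, batch_size=32)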

14. How to run your Django project from the terminal
If your Django project was developed inside a virtualenv, make sure to activate that virtualenv before running it in the terminal, as shown below:

sunxiangguo@sunxiangguo-ubuntu:~/personality_web/bin$ source ./activate
(personality_web) sunxiangguo@sunxiangguo-ubuntu:~/personality_web/bin$ cd /home/sunxiangguo/PycharmProjects/personality_web
(personality_web) sunxiangguo@sunxiangguo-ubuntu:~/PycharmProjects/personality_web$ python manage.py runserver
Performing system checks...

System check identified no issues (0 silenced).
July 25, 2017 - 11:43:50
Django version 1.9.13, using settings 'personality_web.settings'
Starting development server at http://127.0.0.1:8000/
Quit the server with CONTROL-C.