# tf
import tensorflow as tf

# A tf constant: the constructor's return value represents the op's output,
# which is only materialized when run inside a session.
tf.constant('Hello, TensorFlow!')

# Basic operations with constant inputs.
a = tf.constant(2)
b = tf.constant(3)

# FIX: the original also created a bare `tf.Session()` here that was never
# used and never closed (resource leak); the `with` block below manages the
# session lifetime instead.
with tf.Session() as sess:
    print ("a: %i" % sess.run(a), "b: %i" % sess.run(b))
    print ("Addition with constants: %i" % sess.run(a+b))
    print ("Multiplication with constants: %i" % sess.run(a*b))
# Basic operations with placeholders as graph input.
# A tf.placeholder declares a value that is fed at session-run time via
# feed_dict (here: 16-bit integers).  The constructor's return value
# represents the output of the placeholder op.
a = tf.placeholder(tf.int16)
b = tf.placeholder(tf.int16)

# Define ops on the placeholders: add / subtract / multiply / divide.
add = tf.add(a, b)
mul = tf.multiply(a, b)
sub = tf.subtract(a, b)
div = tf.divide(a, b)

# Launch the default graph and run every op with concrete fed inputs.
# (FIX: the original duplicated this comment line.)
with tf.Session() as sess:
    print ("Addition with variables: %i" % sess.run(add, feed_dict={a: 2, b: 3}))
    print ("Subtract with variables: %i" % sess.run(sub, feed_dict={a: 2, b: 3}))
    print ("Multiplication with variables: %i" % sess.run(mul, feed_dict={a: 2, b: 3}))
    # FIX: tf.divide on integer inputs returns a float; the original "%i"
    # truncated 2/3 to 0 -- "%f" shows the true quotient.
    print ("Divide with variables: %f" % sess.run(div, feed_dict={a: 2, b: 3}))
# ----------------
# More in details:
# Matrix Multiplication from TensorFlow official tutorial
# Create a Constant op that produces a 1x2 matrix. The op is
# added as a node to the default graph.
#
# The value returned by the constructor represents the output
# of the Constant op.
# Matrix multiplication demo: first build the two constant matrices.
matrix1 = tf.constant([[3., 3.]])  # 1 row x 2 cols
matrix2 = tf.constant([[2.],[2.]])  # 2 rows x 1 col
# Create a Matmul op that takes 'matrix1' and 'matrix2' as inputs.
# The returned value, 'product', represents the result of the matrix
# multiplication.
# matmul of m1 x m2 requires shapes (a, b) x (b, c).
product = tf.matmul(matrix1, matrix2)
# Element-wise multiplication: the two statements below are equivalent
# (the `*` operator maps to tf.multiply).
# NOTE(review): with shapes (1, 2) and (2, 1) these broadcast to a (2, 2)
# result rather than a same-shape element-wise product -- presumably
# intentional for the demo, but worth confirming.
dot_mul1 = matrix1 * matrix2
dot_mul2 = tf.multiply(matrix1 , matrix2)
# To run the matmul op we call the session 'run()' method, passing
# 'product', which represents the output of the matmul op.  All inputs
# needed by the op are run automatically by the session (typically in
# parallel), so run(product) causes the execution of three ops in the
# graph: the two constants and the matmul.
#
# The output of the op is returned in 'result' as a numpy `ndarray`.
with tf.Session() as sess:
    result = sess.run(product)
    # FIX: `print result` is Python 2 statement syntax and a SyntaxError
    # under Python 3 (which the print() calls elsewhere in this file imply).
    print(result)
读取txt文本中的语料数据
假设txt中每行是一条语料+label,如下面的一行
语料为“可以…新浪微博”
label为1
可以直接到编辑部买,地址,北京体育馆路8号,中国体育报业总社院内,后楼51700:羽毛球杂志木有支付宝,木有财付通,在网上订购不支持货到付款么?那么北京哪个实体店有卖12月的《羽毛球》杂志,或者说,能去你们编辑社买不?地址?转发(3)评论(4)12月10日09:17来自新浪微博 1
# Read the corpus: each line of the txt file is "<text>\t<label>",
# e.g. '...来自新浪微博\t1'.
with open("data/text.txt","r",encoding="utf-8") as reader:
    data=reader.read().splitlines()

# Split each line into its text and label halves.  Labels are kept as
# strings, matching the original behavior.
texts=[]
labels=[]
for line in data:
    parts = line.split("\t")  # parts == [text_str, label_str]
    # This demo is a binary classification, so keep only labels 0 and 1.
    # FIX: the original called int() unguarded and crashed on any line with
    # a non-numeric label; isdigit() skips malformed lines instead.
    if len(parts) == 2 and parts[1].isdigit() and int(parts[1]) < 2:
        texts.append(parts[0])
        labels.append(parts[1])
如果需要使用Bert训练,需要制作好Bert所需的文本数据格式
分别为 input_ids input_masks segment_ids
如,对于上句“可以直接到编辑部买,地址,北京体育馆路8号,中国体育报业总社院内,后楼51700:羽毛球杂志木有支付宝,木有财付通,在网上订购不支持货到付款么?那么北京哪个实体店有卖12月的《羽毛球》杂志,或者说,能去你们编辑社买不?地址?转发(3)评论(4)12月10日09:17来自新浪微博”
其中 input_ids为
[101,1377,…,102]#长度为128
input_masks为
[1,1,…,1]#长度为128
segment_ids为
[0,0,…,0]#长度为128
这其中涉及到
1、将bytes转为Unicode编码
text.decode("utf-8", "ignore")
2、数据清洗
去除控制字符和转换空格
def _clean_text(self, text):
    """Drop invalid and control characters; normalize whitespace to ' '."""
    cleaned = []
    for ch in text:
        code = ord(ch)
        # NUL and U+FFFD (the Unicode replacement char) mark invalid
        # input; control characters carry no text content -- skip all.
        if code == 0 or code == 0xFFFD or _is_control(ch):
            continue
        cleaned.append(" " if _is_whitespace(ch) else ch)
    return "".join(cleaned)
def _is_control(char):
"""Checks whether `chars` is a control character.
In computing and telecommunication, a control character or non-printing character (NPC) is a code point (a number) in a character set, that does not represent a written symbol. They are used as in-band signaling to cause effects other than the addition of a symbol to the text.控制符:LF(换行)、CR(回车)、FF(换页)、DEL(删除)、BS(退格)、BEL(振铃)等;通讯专用字符:SOH(文头)、EOT(文尾)、ACK(确认)等。
"""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
3、中文分词
对中文 插入结果为[" “,char,” "]
def _tokenize_chinese_chars(self, text):
    """Surround every CJK character with spaces so each tokenizes alone."""
    pieces = []
    for ch in text:
        if self._is_chinese_char(ord(ch)):
            pieces.extend((" ", ch, " "))
        else:
            pieces.append(ch)
    return "".join(pieces)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def whitespace_tokenize(text):
    """Split `text` on runs of whitespace; returns [] for blank input."""
    # str.split() with no separator already ignores leading/trailing
    # whitespace and collapses runs, so the explicit strip()/empty check
    # of the classic implementation folds into a single call.
    return text.split()
tensorboard的用法
在session之前
os.mkdir('logs')
writer = tf.summary.FileWriter('logs', tf.get_default_graph())
writer.close()
而后在cmd中切到logs目录的上层
tensorboard --logdir logs
这时系统会返回一个地址,形如
TensorBoard 1.8.0 at http://D-C02200BK-1815:6006 (Press CTRL+C to quit)
在chrome里打开http://D-C02200BK-1815:6006 即可