TF Learning Notes

tf

import tensorflow as tf

# tf constant
hello = tf.constant('Hello, TensorFlow!')

# Start tf session
sess = tf.Session()
print(sess.run(hello))  # b'Hello, TensorFlow!'

#basic operation
a = tf.constant(2)
b = tf.constant(3)
with tf.Session() as sess:
    print ("a: %i" % sess.run(a), "b: %i" % sess.run(b))
    print ("Addition with constants: %i" % sess.run(a+b))
    print ("Multiplication with constants: %i" % sess.run(a*b))


# Basic operations with placeholders as graph input
# The value returned by the constructor stands for the graph input;
# actual values are supplied via feed_dict when the session runs.
# tf Graph input
# tf.placeholder declares data that will be fed in later:
# holder_name = tf.placeholder(tf.datatype)
a = tf.placeholder(tf.int16)
b = tf.placeholder(tf.int16)

# Define some operations
# Operations on the placeholders
# Basic arithmetic: addition, subtraction, multiplication, division
# tf.add, tf.subtract, tf.multiply, tf.divide
add = tf.add(a, b)
mul = tf.multiply(a, b)
sub = tf.subtract(a,b)
div = tf.divide(a,b)

# Launch the default graph.
with tf.Session() as sess:
    # Run every operation with variable input
    print ("Addition with variables: %i" % sess.run(add, feed_dict={a: 2, b: 3}))
    print ("Subtract with variables: %i" % sess.run(sub, feed_dict={a: 2, b: 3}))
    print ("Multiplication with variables: %i" % sess.run(mul, feed_dict={a: 2, b: 3}))
    print ("Divide with variables: %i" % sess.run(div, feed_dict={a: 2, b: 3}))

# ----------------
# In more detail:
# Matrix Multiplication from TensorFlow official tutorial

# Create a Constant op that produces a 1x2 matrix.  The op is
# added as a node to the default graph.
#
# The value returned by the constructor represents the output
# of the Constant op.
# Matrix multiplication
# First initialize the matrices
matrix1 = tf.constant([[3., 3.]])   # 1 row, 2 columns
matrix2 = tf.constant([[2.], [2.]]) # 2 rows, 1 column

# Create a Matmul op that takes 'matrix1' and 'matrix2' as inputs.
# The returned value, 'product', represents the result of the matrix
# multiplication.
# Matrix multiplication m1 x m2
# requires shapes m1: a x b, m2: b x c
product = tf.matmul(matrix1, matrix2)


# Element-wise multiplication (not a matrix product)
# The following two statements are equivalent
dot_mul1 = matrix1 * matrix2
dot_mul2 = tf.multiply(matrix1 , matrix2)
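
Note that matrix1 is 1x2 and matrix2 is 2x1, so the element-wise product above broadcasts to a 2x2 result; it is not a scalar dot product. A quick check using the constants defined above:

with tf.Session() as sess:
    print(sess.run(dot_mul1))
    # broadcast result, shape (2, 2):
    # [[6. 6.]
    #  [6. 6.]]
    print(sess.run(product))
    # true matrix product, shape (1, 1): [[12.]]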

# To run the matmul op we call the session 'run()' method, passing 'product'
# which represents the output of the matmul op.  This indicates to the call
# that we want to get the output of the matmul op back.
#
# All inputs needed by the op are run automatically by the session.  They
# typically are run in parallel.
#
# The call 'run(product)' thus causes the execution of three ops in the
# graph: the two constants and matmul.
#
# The output of the op is returned in 'result' as a numpy `ndarray` object.
with tf.Session() as sess:
    result = sess.run(product)
    print(result)




Reading corpus data from a txt file

Assume each line in the txt file is one piece of text plus a label, like the line below:
the text is "可以…新浪微博" (abbreviated),
and the label is 1.
可以直接到编辑部买,地址,北京体育馆路8号,中国体育报业总社院内,后楼51700:羽毛球杂志木有支付宝,木有财付通,在网上订购不支持货到付款么?那么北京哪个实体店有卖12月的《羽毛球》杂志,或者说,能去你们编辑社买不?地址?转发(3)评论(4)12月10日09:17来自新浪微博 1

with open("data/text.txt","r",encoding="utf-8") as reader:
    data = reader.read().splitlines()  # the txt file contains a number of lines like the one above
    
'''After executing the statements above, the contents of data look like this:
'可以直接到编辑部买,地址,北京体育馆路8号,中国体育报业总社院内,后楼51700:羽毛球杂志木有支付宝,木有财付通,在网上订购不支持货到付款么?那么北京哪个实体店有卖12月的《羽毛球》杂志,或者说,能去你们编辑社买不?地址?转发(3)评论(4)12月10日09:17来自新浪微博\t1',
'''


# Then split each line and put the text part and the label part into two separate lists
texts = []
labels = []
for line in data:
    line = line.split("\t")  # line = [text_str, label_str]
    # This demo is a binary classification problem; the training samples were not
    # carefully cleaned, so drop anything whose label is greater than 1.
    if len(line) == 2 and int(line[1]) < 2:
        texts.append(line[0])
        labels.append(int(line[1]))  # store the label as an int

To train with BERT, the text first has to be converted into the input format BERT expects,
namely input_ids, input_masks and segment_ids.
For example, for the sentence above, "可以直接到编辑部买,地址,北京体育馆路8号,中国体育报业总社院内,后楼51700:羽毛球杂志木有支付宝,木有财付通,在网上订购不支持货到付款么?那么北京哪个实体店有卖12月的《羽毛球》杂志,或者说,能去你们编辑社买不?地址?转发(3)评论(4)12月10日09:17来自新浪微博",
input_ids is
[101,1377,…,102]  # length 128
input_masks is
[1,1,…,1]  # length 128
segment_ids is
[0,0,…,0]  # length 128
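
A minimal sketch of building these three lists with the tokenization module from the google-research/bert repo. The vocab path chinese_L-12_H-768_A-12/vocab.txt, the max_seq_length of 128 and the helper name convert_single_text are assumptions for illustration, not fixed by the original code.

import tokenization  # tokenization.py from the google-research/bert repo

max_seq_length = 128  # assumed, matches the lengths quoted above
tokenizer = tokenization.FullTokenizer(
    vocab_file="chinese_L-12_H-768_A-12/vocab.txt",  # assumed path to the Chinese BERT vocab
    do_lower_case=True)

def convert_single_text(text, max_seq_length, tokenizer):
    tokens = tokenizer.tokenize(text)
    tokens = tokens[:max_seq_length - 2]        # reserve two positions for [CLS] and [SEP]
    tokens = ["[CLS]"] + tokens + ["[SEP]"]     # [CLS] is id 101, [SEP] is id 102 in the Chinese vocab

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_masks = [1] * len(input_ids)          # 1 marks a real token
    segment_ids = [0] * len(input_ids)          # single-sentence task, so all 0

    # Pad with 0 up to max_seq_length
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_masks.append(0)
        segment_ids.append(0)
    return input_ids, input_masks, segment_ids

input_ids, input_masks, segment_ids = convert_single_text(texts[0], max_seq_length, tokenizer)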

This involves the following steps:
1. Convert bytes to Unicode

text.decode("utf-8", "ignore")
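
In Python 3 the raw text may arrive as either str or bytes depending on how the file was opened, so it is convenient to wrap the decode in a small helper; a Python-3-only sketch along the lines of BERT's convert_to_unicode:

def convert_to_unicode(text):
    """Returns text as str (unicode), decoding bytes with UTF-8 if necessary."""
    if isinstance(text, str):
        return text
    elif isinstance(text, bytes):
        return text.decode("utf-8", "ignore")
    else:
        raise ValueError("Unsupported string type: %s" % (type(text)))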

2. Data cleaning
Remove control characters and normalize whitespace.

  def _clean_text(self, text):
    """Performs invalid character removal and whitespace cleanup on text."""
    output = []
    for char in text:
      cp = ord(char)
      if cp == 0 or cp == 0xfffd or _is_control(char):
        continue
      if _is_whitespace(char):
        output.append(" ")
      else:
        output.append(char)
    return "".join(output)

import unicodedata

def _is_control(char):
  """Checks whether `char` is a control character.

  In computing and telecommunication, a control character or non-printing character
  (NPC) is a code point in a character set that does not represent a written symbol.
  They are used as in-band signaling to cause effects other than the addition of a
  symbol to the text. Control characters include LF (line feed), CR (carriage return),
  FF (form feed), DEL (delete), BS (backspace), BEL (bell), etc.; communication-specific
  characters include SOH (start of header), EOT (end of transmission), ACK (acknowledge), etc.
  """
  # These are technically control characters but we count them as whitespace
  # characters.
  if char == "\t" or char == "\n" or char == "\r":
    return False
  cat = unicodedata.category(char)
  if cat.startswith("C"):
    return True
  return False

def _is_whitespace(char):
  """Checks whether `char` is a whitespace character."""
  # \t, \n, and \r are technically control characters but we treat them
  # as whitespace since they are generally considered as such.
  if char == " " or char == "\t" or char == "\n" or char == "\r":
    return True
  cat = unicodedata.category(char)
  if cat == "Zs":
    return True
  return False

3. Chinese tokenization
For each Chinese character, insert [" ", char, " "] into the output, i.e. wrap the character in spaces.

  def _tokenize_chinese_chars(self, text):
    """Adds whitespace around any CJK character."""
    output = []
    for char in text:
      cp = ord(char)
      if self._is_chinese_char(cp):
        output.append(" ")
        output.append(char)
        output.append(" ")
      else:
        output.append(char)
    return "".join(output)

  def _is_chinese_char(self, cp):
    """Checks whether CP is the codepoint of a CJK character."""
    # This defines a "chinese character" as anything in the CJK Unicode block:
    #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
    #
    # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
    # despite its name. The modern Korean Hangul alphabet is a different block,
    # as is Japanese Hiragana and Katakana. Those alphabets are used to write
    # space-separated words, so they are not treated specially and handled
    # like all of the other languages.
    if ((cp >= 0x4E00 and cp <= 0x9FFF) or  #
        (cp >= 0x3400 and cp <= 0x4DBF) or  #
        (cp >= 0x20000 and cp <= 0x2A6DF) or  #
        (cp >= 0x2A700 and cp <= 0x2B73F) or  #
        (cp >= 0x2B740 and cp <= 0x2B81F) or  #
        (cp >= 0x2B820 and cp <= 0x2CEAF) or
        (cp >= 0xF900 and cp <= 0xFAFF) or  #
        (cp >= 0x2F800 and cp <= 0x2FA1F)):  #
      return True

    return False
def whitespace_tokenize(text):
  """Runs basic whitespace cleaning and splitting on a peice of text."""
  text = text.strip()
  if not text:
    return []
  tokens = text.split()
  return tokens
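
In BERT these helpers are methods of BasicTokenizer. As a rough illustration of how they chain together, assume the self-taking snippets above (_clean_text, _tokenize_chinese_chars, _is_chinese_char) sit at module level and reuse them in a small illustrative class; BasicChineseTokenizer is not a BERT class, just a sketch:

class BasicChineseTokenizer(object):
  """Illustrative wrapper that chains the cleaning / CJK-splitting steps above."""
  _clean_text = _clean_text
  _tokenize_chinese_chars = _tokenize_chinese_chars
  _is_chinese_char = _is_chinese_char

  def tokenize(self, text):
    text = self._clean_text(text)              # 1) drop control chars, normalize whitespace
    text = self._tokenize_chinese_chars(text)  # 2) wrap every CJK character in spaces
    return whitespace_tokenize(text)           # 3) split on whitespace

tokenizer = BasicChineseTokenizer()
print(tokenizer.tokenize("打羽毛球 play badminton"))
# -> ['打', '羽', '毛', '球', 'play', 'badminton']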

How to use TensorBoard

Before creating the session (once the graph has been built):

import os

if not os.path.exists('logs'):
    os.mkdir('logs')
writer = tf.summary.FileWriter('logs', tf.get_default_graph())
writer.close()

Then, in cmd, change to the parent directory of logs and run:

tensorboard --logdir logs

TensorBoard then prints an address, something like
TensorBoard 1.8.0 at http://D-C02200BK-1815:6006 (Press CTRL+C to quit)
Open http://D-C02200BK-1815:6006 in Chrome.
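
Beyond just dumping the graph, the usual pattern is to log scalar summaries during training and write them with the same FileWriter so they show up as curves in TensorBoard. A rough TF 1.x sketch; the loss below is only a stand-in for a real training loss and the feed values are arbitrary:

targets = tf.placeholder(tf.float32, name="targets")
loss = tf.reduce_mean(tf.square(targets))            # stand-in for a real training loss
tf.summary.scalar("loss", loss)                      # register the scalar to be logged
merged = tf.summary.merge_all()

writer = tf.summary.FileWriter("logs", tf.get_default_graph())
with tf.Session() as sess:
    for step in range(100):
        summary = sess.run(merged, feed_dict={targets: [1.0, 2.0, 3.0]})
        writer.add_summary(summary, step)            # each call adds one point to the curve
writer.close()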
