# tf
import tensorflow as tf

# A tf constant: the constructor's return value represents the op's output,
# which is only materialized when run inside a session.
tf.constant('Hello, TensorFlow!')

# Basic operations with constant inputs.
a = tf.constant(2)
b = tf.constant(3)

# FIX: the original also created a bare `tf.Session()` here that was never
# used and never closed (resource leak); the `with` block below manages the
# session lifetime instead.
with tf.Session() as sess:
    print ("a: %i" % sess.run(a), "b: %i" % sess.run(b))
    print ("Addition with constants: %i" % sess.run(a+b))
    print ("Multiplication with constants: %i" % sess.run(a*b))
# Basic operations with placeholders as graph input.
# A tf.placeholder declares a value that is fed at session-run time via
# feed_dict (here: 16-bit integers).  The constructor's return value
# represents the output of the placeholder op.
a = tf.placeholder(tf.int16)
b = tf.placeholder(tf.int16)

# Define ops on the placeholders: add / subtract / multiply / divide.
add = tf.add(a, b)
mul = tf.multiply(a, b)
sub = tf.subtract(a, b)
div = tf.divide(a, b)

# Launch the default graph and run every op with concrete fed inputs.
# (FIX: the original duplicated this comment line.)
with tf.Session() as sess:
    print ("Addition with variables: %i" % sess.run(add, feed_dict={a: 2, b: 3}))
    print ("Subtract with variables: %i" % sess.run(sub, feed_dict={a: 2, b: 3}))
    print ("Multiplication with variables: %i" % sess.run(mul, feed_dict={a: 2, b: 3}))
    # FIX: tf.divide on integer inputs returns a float; the original "%i"
    # truncated 2/3 to 0 -- "%f" shows the true quotient.
    print ("Divide with variables: %f" % sess.run(div, feed_dict={a: 2, b: 3}))
# ----------------
# More in details:
# Matrix Multiplication from TensorFlow official tutorial
# Create a Constant op that produces a 1x2 matrix. The op is
# added as a node to the default graph.
#
# The value returned by the constructor represents the output
# of the Constant op.
# Matrix multiplication demo: first build the two constant matrices.
matrix1 = tf.constant([[3., 3.]])  # 1 row x 2 cols
matrix2 = tf.constant([[2.],[2.]])  # 2 rows x 1 col
# Create a Matmul op that takes 'matrix1' and 'matrix2' as inputs.
# The returned value, 'product', represents the result of the matrix
# multiplication.
# matmul of m1 x m2 requires shapes (a, b) x (b, c).
product = tf.matmul(matrix1, matrix2)
# Element-wise multiplication: the two statements below are equivalent
# (the `*` operator maps to tf.multiply).
# NOTE(review): with shapes (1, 2) and (2, 1) these broadcast to a (2, 2)
# result rather than a same-shape element-wise product -- presumably
# intentional for the demo, but worth confirming.
dot_mul1 = matrix1 * matrix2
dot_mul2 = tf.multiply(matrix1 , matrix2)
# To run the matmul op we call the session 'run()' method, passing
# 'product', which represents the output of the matmul op.  All inputs
# needed by the op are run automatically by the session (typically in
# parallel), so run(product) causes the execution of three ops in the
# graph: the two constants and the matmul.
#
# The output of the op is returned in 'result' as a numpy `ndarray`.
with tf.Session() as sess:
    result = sess.run(product)
    # FIX: `print result` is Python 2 statement syntax and a SyntaxError
    # under Python 3 (which the print() calls elsewhere in this file imply).
    print(result)
读取txt文本中的语料数据
假设txt中每行是一条语料+label,如下面的一行
语料为“可以…新浪微博”
label为1
可以直接到编辑部买,地址,北京体育馆路8号,中国体育报业总社院内,后楼51700:羽毛球杂志木有支付宝,木有财付通,在网上订购不支持货到付款么?那么北京哪个实体店有卖12月的《羽毛球》杂志,或者说,能去你们编辑社买不?地址?转发(3)评论(4)12月10日09:17来自新浪微博 1
# Read the corpus: each line of the txt file is "<text>\t<label>",
# e.g. '...来自新浪微博\t1'.
with open("data/text.txt","r",encoding="utf-8") as reader:
    data=reader.read().splitlines()

# Split each line into its text and label halves.  Labels are kept as
# strings, matching the original behavior.
texts=[]
labels=[]
for line in data:
    parts = line.split("\t")  # parts == [text_str, label_str]
    # This demo is a binary classification, so keep only labels 0 and 1.
    # FIX: the original called int() unguarded and crashed on any line with
    # a non-numeric label; isdigit() skips malformed lines instead.
    if len(parts) == 2 and parts[1].isdigit() and int(parts[1]) < 2:
        texts.append(parts[0])
        labels.append(parts[1])
如果需要使用Bert训练,需要制作好Bert所需的文本数据格式
分别为 input_ids input_masks segment_ids
如,对于上句“可以直接到编辑部买,地址,北京体育馆路8号,中国体育报业总社院内,后楼51700:羽毛球杂志木有支付宝,木有财付通,在网上订购不支持货到付款么?那么北京哪个实体店有卖12月的《羽毛球》杂志,或者说,能去你们编辑社买不?地址?转发(3)评论(4)12月10日09:17来自新浪微博”
其中 input_ids为
[101,1377,…,102]#长度为128
input_masks为
[1,1,…,1]#长度为128
segment_ids为
[0,0,…,0]#长度为128
这其中涉及到
1、将bytes转为Unicode编码
text.decode("utf-8", "ignore")
2、数据清洗
去除控制字符和转换空格
def _clean_text(self, text):
    """Drop invalid and control characters; normalize whitespace to ' '."""
    cleaned = []
    for ch in text:
        code = ord(ch)
        # NUL and U+FFFD (the Unicode replacement char) mark invalid
        # input; control characters carry no text content -- skip all.
        if code == 0 or code == 0xFFFD or _is_control(ch):
            continue
        cleaned.append(" " if _is_whitespace(ch) else ch)
    return "".join(cleaned)
def _is_control(char):
"""Checks whether `chars` is a control character.
In computing and telecommunication, a control character or non-printing character (NPC) is a code point (a number) in a character set, that does not represent a written symbol. They are used as in-band signaling to cause effects other than the addition of a symbol to the text.控制符:LF(换行)、CR(回车)、FF(换页)、DEL(删除)、BS(退格)、BEL(振铃)等;通讯专用字符:SOH(文头)、EOT(文尾)、ACK(确认)等。
"""
# These are technically control characters but we count them as whitespace
# characters.
if char == "\t" or char == "\n" or char == "\r":
return False
cat = unicodedata.category(char)
if cat.startswith("C"):
return True
return False
def _is_whitespace(char):
"""Checks whether `chars` is a whitespace character."""
# \t, \n, and \r are technically contorl characters but we treat them
# as whitespace since they are generally considered as such.
if char == " " or char == "\t" or char == "\n" or char == "\r":
return True
cat = unicodedata.category(char)
if cat == "Zs":
return True
return False
3、中文分词
对中文 插入结果为[" “,char,” "]
def _tokenize_chinese_chars(self, text):
    """Surround every CJK character with spaces so each tokenizes alone."""
    pieces = []
    for ch in text:
        if self._is_chinese_char(ord(ch)):
            pieces.extend((" ", ch, " "))
        else:
            pieces.append(ch)
    return "".join(pieces)
def _is_chinese_char(self, cp):
"""Checks whether CP is the codepoint of a CJK character."""
# This defines a "chinese character" as anything in the CJK Unicode block:
# https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
#
# Note that the CJK Unicode block is NOT all Japanese and Korean characters,
# despite its name. The modern Korean Hangul alphabet is a different block,
# as is Japanese Hiragana and Katakana. Those alphabets are used to write
# space-separated words, so they are not treated specially and handled
# like the all of the other languages.
if ((cp >= 0x4E00 and cp <= 0x9FFF) or #
(cp >= 0x3400 and cp <= 0x4DBF) or #
(cp >= 0x20000 and cp <= 0x2A6DF) or #
(cp >= 0x2A700 and cp <= 0x2B73F) or #
(cp >= 0x2B740 and cp <= 0x2B81F) or #
(cp >= 0x2B820 and cp <= 0x2CEAF) or
(cp >= 0xF900 and cp <= 0xFAFF) or #
(cp >= 0x2F800 and cp <= 0x2FA1F)): #
return True
return False
def whitespace_tokenize(text):
    """Split `text` on runs of whitespace; returns [] for blank input."""
    # str.split() with no separator already ignores leading/trailing
    # whitespace and collapses runs, so the explicit strip()/empty check
    # of the classic implementation folds into a single call.
    return text.split()
tensorboard的用法
在session之前
os.mkdir('logs')
writer = tf.summary.FileWriter('logs', tf.get_default_graph())
writer.close()
而后在cmd中切到logs目录的上层
tensorboard --logdir logs
这时系统会返回一个地址,形如
TensorBoard 1.8.0 at http://D-C02200BK-1815:6006 (Press CTRL+C to quit)
在chrome里打开http://D-C02200BK-1815:6006 即可