import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
import re
import inspect
import tensorflow as tf
from tensorflow import keras
# import tensorflow.keras.backend as Kfrom sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import nltk
import datetime
import transformers
from transformers import BertConfig,TFBertPreTrainedModel,BertTokenizer,TFBertMainLayer,TFBertModel
# Record the library versions this run used; each print is its own statement
# (they had been fused onto one line, which does not parse).
print("tf_version_ : ", tf.__version__)
print("transformers:", transformers.__version__)
# data_clean
# English stop-word list from NLTK; cleanword below does not consult it.
stopwords_english = stopwords.words("english")
# print(stopwords_english)

# Compiled once at import time because cleanword runs once per row.
_URL_RE = re.compile(r"http\S*")
_MENTION_RE = re.compile(r"@\S*")
_DIGITS_RE = re.compile(r"\d+")
# A U+0089 character plus up to four following non-space characters, i.e. the
# same five-character window the earlier implementation cut out.
# NOTE(review): U+0089 looks like the lead character of mis-decoded
# punctuation in the raw text -- confirm against the dataset.
_MOJIBAKE_RE = re.compile(r"\x89\S{0,4}")
# Every punctuation character (and newline) that is blanked out.
_PUNCT_TO_SPACE = str.maketrans({ch: " " for ch in "\n,?.[]!:-#|();=></"})


def cleanword(s):
    """Return a lower-cased, de-noised copy of the text *s*.

    Steps, in order:
      1. lower-case the text;
      2. blank out URLs (``http...``), ``@`` mentions, digit runs and
         U+0089-led fragments;
      3. replace the punctuation in ``_PUNCT_TO_SPACE`` with spaces;
      4. collapse whitespace so words are separated by exactly one space
         with no leading or trailing space.

    Args:
        s: The raw text (a ``str``).

    Returns:
        The cleaned text; an empty string if nothing but noise was present.
    """
    s = s.lower()
    # re.sub replaces every occurrence in one pass. The previous
    # findall-then-str.replace approach let one replacement corrupt a later
    # match (e.g. removing "1" first left the "2" of "12" in place).
    for pattern in (_URL_RE, _MENTION_RE, _DIGITS_RE, _MOJIBAKE_RE):
        s = pattern.sub(" ", s)
    s = s.translate(_PUNCT_TO_SPACE)
    # split() with no argument drops empty fields, so joining with a single
    # space both collapses runs of whitespace and keeps the final word
    # separated from the one before it (it used to be glued on).
    return " ".join(s.split())
# Run the same text cleaner over the 'text' column of both splits, test
# first and then train, overwriting the column in place.
for _frame in (data_test, data_train):
    _frame['text'] = _frame['text'].apply(cleanword)
Calling BertTokenizer.from_pretrained() with the path to a single file or url is deprecated
100%|██████████| 7613/7613 [00:02<00:00, 3689.10it/s]
100%|██████████| 3263/3263 [00:00<00:00, 3741.57it/s]
word_len_percent: 36.0
# Instantiate the classifier and configure training: binary cross-entropy
# loss, Adam with a 1e-5 learning rate, accuracy reported as the metric.
model = TweetBERT()
loss = "binary_crossentropy"
optimizer = keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=["accuracy"],
)
WARNING:tensorflow:Entity <bound method TweetBERT.call of <__main__.TweetBERT object at 0x7f74d8a65400>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of <bound method TweetBERT.call of <__main__.TweetBERT object at 0x7f74d8a65400>>, which Python reported as:
def call(self,inputs):
input_id, input_mask,input_segment = inputs
sequence_output, pooler_output, hidden_states = self.bert_model(input_id,attention_mask=input_mask,token_type_ids=input_segment)
h12 = tf.reshape(hidden_states[-1][:,0],(-1,1,self.hidden_size))
h11 = tf.reshape(hidden_states[-2][:,0],(-1,1,self.hidden_size))
h10 = tf.reshape(hidden_states[-3][:,0],(-1,1,self.hidden_size))
h09 = tf.reshape(hidden_states[-4][:,0],(-1,1,self.hidden_size))
concat_hidden = self.concat(([h12,h11,h10,h09]))
x = self.avgpool(concat_hidden)
# x = sequence_output[:,0,:]
x = self.dropout(x)
x = self.output_(x)
return x
This may be caused by multiline strings or comments not indented at the same level as the code.
WARNING: Entity <bound method TweetBERT.call of <__main__.TweetBERT object at 0x7f74d8a65400>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of <bound method TweetBERT.call of <__main__.TweetBERT object at 0x7f74d8a65400>>, which Python reported as:
def call(self,inputs):
input_id, input_mask,input_segment = inputs
sequence_output, pooler_output, hidden_states = self.bert_model(input_id,attention_mask=input_mask,token_type_ids=input_segment)
h12 = tf.reshape(hidden_states[-1][:,0],(-1,1,self.hidden_size))
h11 = tf.reshape(hidden_states[-2][:,0],(-1,1,self.hidden_size))
h10 = tf.reshape(hidden_states[-3][:,0],(-1,1,self.hidden_size))
h09 = tf.reshape(hidden_states[-4][:,0],(-1,1,self.hidden_size))
concat_hidden = self.concat(([h12,h11,h10,h09]))
x = self.avgpool(concat_hidden)
# x = sequence_output[:,0,:]
x = self.dropout(x)
x = self.output_(x)
return x
This may be caused by multiline strings or comments not indented at the same level as the code.
Train on 6090 samples, validate on 1523 samples
Epoch 1/3
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_7/bert/pooler/dense/kernel:0', 'tf_bert_model_7/bert/pooler/dense/bias:0'] when minimizing the loss.
WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_model_7/bert/pooler/dense/kernel:0', 'tf_bert_model_7/bert/pooler/dense/bias:0'] when minimizing the loss.
6080/6090 [============================>.] - ETA: 0s - loss: 0.4599 - accuracy: 0.7929
Epoch 00001: val_accuracy improved from -inf to 0.82928, saving model to /home/lowry/pro/kaggle_tweets/kaggle_tweets_emotion/model/bert-base/saveModelWeightCheckpoint
6090/6090 [==============================] - 70s 12ms/sample - loss: 0.4602 - accuracy: 0.7928 - val_loss: 0.3834 - val_accuracy: 0.8293
Epoch 2/3
6080/6090 [============================>.] - ETA: 0s - loss: 0.3485 - accuracy: 0.8546
Epoch 00002: val_accuracy did not improve from 0.82928
6090/6090 [==============================] - 41s 7ms/sample - loss: 0.3483 - accuracy: 0.8548 - val_loss: 0.3990 - val_accuracy: 0.8240
Epoch 3/3
6080/6090 [============================>.] - ETA: 0s - loss: 0.2758 - accuracy: 0.8914
Epoch 00003: val_accuracy did not improve from 0.82928
6090/6090 [==============================] - 41s 7ms/sample - loss: 0.2756 - accuracy: 0.8916 - val_loss: 0.4515 - val_accuracy: 0.8267
# load_best_model
# Build a fresh TweetBERT instance and load the saved weights into it.
# NOTE(review): path_save_model is defined outside this excerpt; presumably
# it is the checkpoint path used during training -- confirm.
model = TweetBERT()
model.load_weights(path_save_model)
# Print the model that was just restored. This previously printed
# `new_model`, a name that is never assigned in the visible code.
print(model)
# model.summary()
<__main__.TweetBERT object at 0x7f749ad53b38>
# Plot the recorded training history, one line per column of the frame.
# NOTE(review): assumes `history` is the object returned by model.fit, which
# is outside this excerpt -- confirm.
# `data` ends up bound to whatever .plot() returns, not to the DataFrame.
data = pd.DataFrame(history.history).plot()
plt.show()

# Run the model over the test inputs and echo the raw outputs. These were two
# statements fused onto one line, which does not parse.
result = model.predict(test_input)
print(result)
WARNING:tensorflow:Entity <bound method TweetBERT.call of <__main__.TweetBERT object at 0x7f749ad53b38>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of <bound method TweetBERT.call of <__main__.TweetBERT object at 0x7f749ad53b38>>, which Python reported as:
def call(self,inputs):
input_id, input_mask,input_atn = inputs
sequence_output, pooler_output, hidden_states = self.bert_model(input_id,attention_mask=input_mask,token_type_ids=input_atn)
h12 = tf.reshape(hidden_states[-1][:,0],(-1,1,self.hidden_size))
h11 = tf.reshape(hidden_states[-2][:,0],(-1,1,self.hidden_size))
h10 = tf.reshape(hidden_states[-3][:,0],(-1,1,self.hidden_size))
h09 = tf.reshape(hidden_states[-4][:,0],(-1,1,self.hidden_size))
concat_hidden = self.concat(([h12,h11,h10,h09]))
x = self.avgpool(concat_hidden)
# x = sequence_output[:,0,:]
x = self.dropout(x)
x = self.output_(x)
return x
This may be caused by multiline strings or comments not indented at the same level as the code.
WARNING: Entity <bound method TweetBERT.call of <__main__.TweetBERT object at 0x7f749ad53b38>> could not be transformed and will be executed as-is. Please report this to the AutoGraph team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output. Cause: Failed to parse source code of <bound method TweetBERT.call of <__main__.TweetBERT object at 0x7f749ad53b38>>, which Python reported as:
def call(self,inputs):
input_id, input_mask,input_atn = inputs
sequence_output, pooler_output, hidden_states = self.bert_model(input_id,attention_mask=input_mask,token_type_ids=input_atn)
h12 = tf.reshape(hidden_states[-1][:,0],(-1,1,self.hidden_size))
h11 = tf.reshape(hidden_states[-2][:,0],(-1,1,self.hidden_size))
h10 = tf.reshape(hidden_states[-3][:,0],(-1,1,self.hidden_size))
h09 = tf.reshape(hidden_states[-4][:,0],(-1,1,self.hidden_size))
concat_hidden = self.concat(([h12,h11,h10,h09]))
x = self.avgpool(concat_hidden)
# x = sequence_output[:,0,:]
x = self.dropout(x)
x = self.output_(x)
return x
This may be caused by multiline strings or comments not indented at the same level as the code.
[[0.5131391 ]
[0.9968698 ]
[0.98543626]
...
[0.99892104]
[0.9709744 ]
[0.98978955]]
# output submit
# date = datetime.datetime.now().strftime("%Y%m%d")
# path_save_submit = "/home/lowry/pro/kaggle_tweets/kaggle_tweets_emotion/result/"+date+"largebert"+".csv"
# submit = result.round()
# submit = [int(li[0]) for li in submit]
# submit_data = pd.DataFrame({"id":data_test.id,"target":submit})
# submit_data.to_csv(path_save_submit,index=False)
# train_input_1 = [train_input[0][:10],train_input[1][:10],train_input[2][:10]]
# re = model(train_input_1)