python怎么训练分类器_如何以仅训练分类器一次的方式调整NLTK Python代码

import nltk
import re
import csv


def processTweet(tweet):
    """Normalize a raw tweet: lower-case it, mask URLs and @usernames,
    collapse whitespace, strip '#' from hashtags, and trim quotes."""
    tweet = tweet.lower()
    # Replace URLs with the placeholder token 'URL'.
    # BUGFIX: the original pattern was (www\.[\s]+), which matched "www."
    # followed by *whitespace* — an obvious typo for [^\s]+ (non-whitespace).
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # Replace @username mentions with the placeholder 'AT_USER'.
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    # Collapse any run of whitespace to a single space.
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # Replace #hashtag with the bare word (drop the '#').
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    # Trim surrounding single/double quotes.
    return tweet.strip('\'"')


def replaceTwoOrMore(s):
    """Collapse any run of 2+ identical characters down to exactly two
    (e.g. 'coooool' -> 'cool'); tames elongated slang spellings."""
    pattern = re.compile(r"(.)\1{1,}", re.DOTALL)
    return pattern.sub(r"\1\1", s)


def getStopWordList(stopWordListFileName):
    """Read stop words (one per line) from *stopWordListFileName* and return
    them as a list, seeded with the placeholder tokens used by processTweet."""
    stopWords = ['AT_USER', 'url', 'URL', 'rt']
    # Context manager guarantees the file is closed (the original leaked the
    # handle if an exception occurred mid-read).
    with open(stopWordListFileName) as fp:
        for line in fp:
            stopWords.append(line.strip())
    return stopWords


def getFeatureVector(tweet):
    """Tokenize a processed tweet into lower-cased feature words, keeping only
    purely alphanumeric tokens that start with a letter and are not stop words.

    NOTE: reads the module-level `stopWords` list built below.
    """
    featureVector = []
    for w in tweet.split():
        # Normalize elongated spellings, then strip edge punctuation.
        w = replaceTwoOrMore(w)
        w = w.strip('\'"?,.')
        # Accept only tokens of the form letter followed by letters/digits.
        val = re.search(r"^[a-zA-Z][a-zA-Z0-9]*$", w)
        if w in stopWords or val is None:
            continue
        featureVector.append(w.lower())
    return featureVector


def extract_features(tweet):
    """Map a token list to NLTK's bag-of-words feature dict: one boolean
    'contains(word)' entry per word in the module-level `featureList`."""
    tweet_words = set(tweet)
    features = {}
    for word in featureList:
        features['contains(%s)' % word] = (word in tweet_words)
    return features


# ---- Build the training data ----
stopWords = getStopWordList('stopwords.txt')

featureList = []   # vocabulary accumulated over all training tweets
tweets = []        # list of (feature_vector, sentiment) training pairs

# BUGFIX: the original opened the CSV in 'rb' (a Python 2 idiom) and never
# closed it; Python 3's csv module needs text mode with newline=''.
with open('sheet3.csv', newline='') as csvfile:
    inpTweets = csv.reader(csvfile, delimiter=',')
    for row in inpTweets:
        sentiment = row[0]
        tweet = row[1]
        featureVector = getFeatureVector(processTweet(tweet))
        featureList.extend(featureVector)
        tweets.append((featureVector, sentiment))

# Remove featureList duplicates (order is not significant for the classifier).
featureList = list(set(featureList))
# Extract feature vectors for all tweets in one shot and train the Naive
# Bayes classifier ONCE; the trained model is then reused for every test
# tweet instead of being rebuilt per line.
training_set = nltk.classify.util.apply_features(extract_features, tweets)
NBClassifier = nltk.NaiveBayesClassifier.train(training_set)

# Classify each line of the test file, writing one predicted label per line.
# BUGFIX: context managers replace the original leak-prone open()/close()
# pairs (files stayed open if classification raised), and the redundant
# fo.seek(0, 0) on a freshly 'w'-opened file is dropped.
with open("april2.tsv") as ft, open("dunno.tsv", "w") as fo:
    for line in ft:
        processedTestTweet = processTweet(line)
        label = NBClassifier.classify(
            extract_features(getFeatureVector(processedTestTweet)))
        fo.write(label + "\n")

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值