- . :代替任何单个字符
- ^a :匹配所有以字母a开头的字符串
- a$ :匹配所有以字母a结尾的字符串
- r"\\" :匹配反斜杠
- [0-9]:任选一个
- [0-9]{3}:[0-9]重复三次
#返回 Match 对象(匹配成功,真值)或 None(匹配失败)
re.search(regex, string);
#返回 string 中所有匹配结果组成的列表
re.findall(regex, string);
将空值替换为“0”的操作:
import numpy as np

# Load the CSV as strings (up to 75 unicode characters per field), skipping the header row.
matrix = np.genfromtxt("....csv", dtype = 'U75', skip_header = 1, delimiter = ',')

# Replace empty fields with '0', one column at a time.
# BUG FIX: the original looped over range(np.shape[0]) -- np.shape is a
# function, not this array's shape, and the loop indexes COLUMNS via
# matrix[:, i], so the bound must be the number of columns: matrix.shape[1].
for i in range(matrix.shape[1]):
    column = (matrix[:, i] == '')   # boolean mask of empty cells in column i
    matrix[column, i] = '0'

# Data conversion: cast the string values to floats.
# NOTE(review): `vector` is not defined in this snippet -- presumably a column
# slice of `matrix`; confirm against the surrounding notes.
vector = vector.astype(float)
NLTK
import nltk
nltk.download('gutenberg')
Linux平台
Ubuntu 自带python2和python3 设置默认Python版本和切换:
直接执行这两个命令即可:
sudo update-alternatives --install /usr/bin/python python /usr/bin/python2 100
sudo update-alternatives --install /usr/bin/python python /usr/bin/python3 150
如果要切换到Python2,执行:
sudo update-alternatives --config python
按照提示输入选择数字回车即可。
由于Ubuntu自带的pip最高版本达不到下载nltk的要求:(强制重装pip)
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
sudo python get-pip.py --force-reinstall
然后再
pip install nltk
gedit出错:gedit是用户图形界面,在服务器上打不开...所以无法进入gedit编辑器
此时直接修改的配置文件:
JAVA_HOME=/usr/lib/jvm/java1.8
JRE_HOME=/usr/lib/jvm/java1.8/jre
PATH=$JAVA_HOME/bin:$PATH
CLASSPATH=.:$JAVA_HOME/lib/dt.jar:$JAVA_HOME/lib/tools.jar
export JAVA_HOME
export JRE_HOME
export PATH
添加相同的代码就可以,最后java -version(一个-),就成功了。
linux nltk包的位置:/usr/local/lib/python3.6/dist-packages
NLTK Stanford NLP
Python NLTK结合Stanford NLP工具包进行分词、词性标注、句法分析
一些不清楚的地方:
比如说文中说的stanfordNLTK目录在哪?
作者的资源链接应该标错了,估摸着最近应该得不到回复,我一个一个下载,整理一下:
NLP
一、文本处理
1.1 分句
import nltk
from nltk.corpus import gutenberg
from pprint import pprint
import numpy as np

# Raw text of "Alice in Wonderland" from the Gutenberg corpus.
alice = gutenberg.raw(fileids='carroll-alice.txt')

# Default sentence tokenizer.
default_st = nltk.sent_tokenize
# BUG FIX: the original printed `alice_sentences` without ever defining it;
# this is the missing tokenization step.
alice_sentences = default_st(text=alice)

print('\nTotal sentences in alice:', len(alice_sentences))
print('First 5 sentences in alice:-')
print(np.array(alice_sentences[0:5]))

# Same splitting done with an explicit PunktSentenceTokenizer instance.
# NOTE(review): `sample_text` is assumed to be defined elsewhere in the notes.
punkt_st = nltk.tokenize.PunktSentenceTokenizer()
sample_sentences = punkt_st.tokenize(sample_text)
print(np.array(sample_sentences))
正则表达式:
# Sentence-splitting pattern: split on whitespace that follows ., ? or !,
# unless the period looks like part of an abbreviation:
#   (?<!\w\.\w.)      - not preceded by an "e.g."/"i.e."-style form
#   (?<![A-Z][a-z]\.) - not preceded by an abbreviated title such as "Mr."
#   (?<![A-Z]\.)      - not preceded by a single initial such as "J."
#   (?<=\.|\?|\!)     - must be preceded by sentence-ending punctuation
SENTENCE_TOKENS_PATTERN = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\.|\?|\!)\s'
# gaps=True: the pattern matches the separators BETWEEN sentences,
# not the sentences themselves.
regex_st = nltk.tokenize.RegexpTokenizer(
pattern=SENTENCE_TOKENS_PATTERN,
gaps=True)
# NOTE(review): `sample_text` is assumed to be defined earlier in the notes.
sample_sentences = regex_st.tokenize(sample_text)
print(np.array(sample_sentences))
1.2 分词
默认:
# NLTK's default word tokenizer.
default_wt = nltk.word_tokenize
# NOTE(review): `sample_text` is assumed to be defined earlier in the notes.
words = default_wt(sample_text)
np.array(words)
Treebank
# Word tokenization with an explicit TreebankWordTokenizer instance.
treebank_wt = nltk.TreebankWordTokenizer()
words = treebank_wt.tokenize(sample_text)
np.array(words)
正则表达式:r'\w+'
# Split on runs of whitespace; gaps=True means the pattern marks the gaps
# BETWEEN tokens rather than the tokens themselves.
GAP_PATTERN = r'\s+'
regex_wt = nltk.RegexpTokenizer(pattern=GAP_PATTERN,
gaps=True)
words = regex_wt.tokenize(sample_text)
np.array(words)
# span_tokenize yields the (start, end) offset of each token in the original text.
word_indices = list(regex_wt.span_tokenize(sample_text))
print(word_indices)  # positions of the split tokens within the original string
print(np.array([sample_text[start:end] for start, end in word_indices]))
# prints each token recovered by slicing the original text at those offsets
def tokenize_text(text):
    """Split *text* into sentences, then each sentence into word tokens.

    Returns a list of lists: one inner list of word tokens per sentence.
    """
    tokenised = []
    for sent in nltk.sent_tokenize(text):
        tokenised.append(nltk.word_tokenize(sent))
    return tokenised
# Tokenize the sample into per-sentence token lists...
sents = tokenize_text(sample_text)
np.array(sents)
# ...then flatten them into one flat list of words.
words = [word for sentence in sents for word in sentence]
np.array(words)
更快的分词分句:
import spacy
# NOTE(review): 'en_core' is probably not a valid spaCy model name -- the
# standard small English model is 'en_core_web_sm'; confirm before running.
nlp = spacy.load('en_core', parse = True, tag=True, entity=True)
text_spacy = nlp(sample_text)
# Sentences exposed by the parsed document object.
sents = np.array(list(text_spacy.sents))
# Per-sentence lists of token strings.
sent_words = [[word.text for word in sent] for sent in sents]
np.array(sent_words)
# Flat list of every token string in the document.
words = [word.text for word in text_spacy]
np.array(words)
去除重音字符:
import unicodedata

def remove_accented_chars(text):
    """Strip accents from *text*.

    Decomposes to NFKD form so accents become separate combining marks,
    then drops everything outside ASCII.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('utf-8', 'ignore')

remove_accented_chars('Sómě Áccěntěd těxt')
删除特殊字符:
def remove_special_characters(text, remove_digits=False):
    """Remove every character that is not a letter, (optionally) a digit, or whitespace.

    Bug fix: the original pattern used the range a-zA-z; in ASCII the A-z
    range also spans the punctuation between 'Z' and 'a' (e.g. _, ^, [, ]),
    so those characters were wrongly kept.  a-zA-Z matches letters only.

    Parameters:
        text: the input string to clean.
        remove_digits: when True, digits are stripped as well.

    Returns:
        The cleaned string.
    """
    import re  # local import: these notes only import `re` further down the file
    pattern = r'[^a-zA-Z0-9\s]' if not remove_digits else r'[^a-zA-Z\s]'
    return re.sub(pattern, '', text)

remove_special_characters("Well this was fun! What do you think? 123#@!",
                          remove_digits=True)
拓展略缩词:
from contractions import CONTRACTION_MAP
import re
def expand_contractions(text, contraction_mapping=CONTRACTION_MAP):
    """Expand contractions ("don't" -> "do not") found in *text*.

    Parameters:
        text: the input text, possibly containing contractions.
        contraction_mapping: dict mapping contraction -> expansion;
            defaults to the project-level CONTRACTION_MAP.

    Returns:
        The text with known contractions expanded and any leftover
        apostrophes removed.
    """
    contractions_pattern = re.compile(
        '({})'.format('|'.join(contraction_mapping.keys())),
        flags=re.IGNORECASE | re.DOTALL)

    def expand_match(contraction):
        match = contraction.group(0)
        first_char = match[0]
        # Look the contraction up as matched, then lower-cased.
        expanded_contraction = contraction_mapping.get(match) \
            if contraction_mapping.get(match) \
            else contraction_mapping.get(match.lower())
        # BUG FIX: the original raised TypeError ('NoneType' is not
        # subscriptable) when neither lookup succeeded; leave such
        # matches unchanged instead.
        if expanded_contraction is None:
            return match
        # Keep the original leading character so casing is preserved.
        return first_char + expanded_contraction[1:]

    expanded_text = contractions_pattern.sub(expand_match, text)
    # Strip any apostrophes that survived expansion.
    expanded_text = re.sub("'", "", expanded_text)
    return expanded_text
缩略词列表:链接:https://pan.baidu.com/s/1qu44acyb6pwMuUtfBqimig 提取码:5rnf