当时看到shell文件下面有这样一个函数,当时觉得使用起来还是很简单的,而且处理文本速度蛮快的。
normalize_text() {
awk '{print tolower($0);}' | sed -e "s/’/'/g" -e "s/′/'/g" -e "s/''/ /g" -e "s/'/ ' /g" -e "s/“/\"/g" -e "s/”/\"/g" \
-e 's/"/ " /g' -e 's/\./ \. /g' -e 's/<br \/>/ /g' -e 's/, / , /g' -e 's/(/ ( /g' -e 's/)/ ) /g' -e 's/\!/ \! /g' \
-e 's/\?/ \? /g' -e 's/\;/ /g' -e 's/\:/ /g' -e 's/-/ - /g' -e 's/=/ /g' -e 's/=/ /g' -e 's/*/ /g' -e 's/|/ /g' \
-e 's/«/ /g' | tr 0-9 " "
}
normalize_text < news.2012.en.shuffled > data.txt
然后就去看了一下相关的函数。
关于swk:
http://www.cnblogs.com/ggjucheng/archive/2013/01/13/2858470.html
awk把文件逐行的读入,以空格为默认分隔符将每行切片,切开的部分再进行各种分析处理。
使用方法:
awk '{pattern + action}' {filenames}
上面的函数,第一句就是用awk来转化大小写。 表示分割的域, 0表示所有域,$1表示所有行的第一列。
sed是把当前行放入临时缓冲区,然后进行文本处理,可以用正则表达式,-e代表对文本进行处理,-f代表对文件进行处理。
sed -e "s/’/'/g"
上面的这个实现了文本的替换,把’替换成’ 。
发现这个网站把linux的命令行做了一个索引。
bzip2 -c -d
enwiki-latest-pages-articles.xml.bz2 | awk '{print tolower($0);}' | perl -e '
# Program to filter Wikipedia XML dumps to "clean" text consisting only of lowercase
# letters (a-z, converted from A-Z), and spaces (never consecutive)...
# All other characters are converted to spaces. Only text which normally appears.
# in the web browser is displayed. Tables are removed. Image captions are.
# preserved. Links are converted to normal text. Digits are spelled out.
# *** Modified to not spell digits or throw away non-ASCII characters ***
# Written by Matt Mahoney, June 10, 2006. This program is released to the public domain.
$/=">"; # input record separator
while (<>) {
if (/<text /) {$text=1;} # remove all but between <text> ... </text>
if (/#redirect/i) {$text=0;} # remove #REDIRECT
if ($text) {
# Remove any text not normally visible
if (/<\/text>/) {$text=0;}
s/<.*>//; # remove xml tags
s/&/&/g; # decode URL encoded chars
s/</</g;
s/>/>/g;
s/<ref[^<]*<\/ref>//g; # remove references <ref...> ... </ref>
s/<[^>]*>//g; # remove xhtml tags
s/\[http:[^] ]*/[/g; # remove normal url, preserve visible text
s/\|thumb//ig; # remove images links, preserve caption
s/\|left//ig;
s/\|right//ig;
s/\|\d+px//ig;
s/\[\[image:[^\[\]]*\|//ig;
s/\[\[category:([^|\]]*)[^]]*\]\]/[[$1]]/ig; # show categories without markup
s/\[\[[a-z\-]*:[^\]]*\]\]//g; # remove links to other languages
s/\[\[[^\|\]]*\|/[[/g; # remove wiki url, preserve visible text
s/{{[^}]*}}//g; # remove {{icons}} and {tables}
s/{[^}]*}//g;
s/\[//g; # remove [ and ]
s/\]//g;
s/&[^;]*;/ /g; # remove URL encoded chars
$_=" $_ ";
chop;
print $_;
}
}
' | normalize_text | awk '{if (NF>1) print;}' >> data.txt