%%% 为文本建立索引是文本信息处理中的一项重要任务:给定一个由英文单词构成的文件,为文件中的所有单词建立索引,记录每个单词出现的行号以及在每行中出现的次数,并将索引存入一个文件。
-module(text_index).
-compile(export_all).
-import(re, [run/2,replace/4]).
%% One index entry. `word` is the key of the ETS table (createDB/0 sets
%% keypos to #data.word). `line` is a list of {LineNumber, Count} tuples,
%% as written by writeDB/2; it defaults to the empty list.
-record(data, {word, line = []}).
%% Name of the named ETS table created by createDB/0 and deleted by closeDB/0.
-define(DBNAME, dataDB).
%%% Database operations.

%%% Creates the ETS table that holds the index: a public, named `set`
%%% keyed on the `word` field of the #data{} record.
%%% Returns the result of ets:new/2.
createDB() ->
    KeyPosition = #data.word,
    TableOptions = [public, named_table, set, {keypos, KeyPosition}],
    ets:new(?DBNAME, TableOptions).
%%% Deletes the index table together with everything stored in it.
%%% Returns the result of ets:delete/1.
closeDB() ->
    TableName = ?DBNAME,
    ets:delete(TableName).
%%% Reads File, splits its content into lines and indexes every line with
%%% processLine/1.
%%%
%%% Returns `ok` after indexing, `ok` after printing a message when the file
%%% cannot be opened, and `{error, Reason}` when reading the opened file fails.
start(File) ->
    case file:open(File, [read]) of
        {ok, IoDevice} ->
            case readAllText(IoDevice) of
                {error, _Reason} = Error ->
                    %% readAllText/1 has already printed the reason and closed
                    %% the device; do not try to tokenize the error tuple.
                    Error;
                Content ->
                    %% Split on every line terminator individually so that
                    %% blank lines are kept as empty strings. Dropping them
                    %% (as string:tokens/2 does) would shift the line number
                    %% of every line that follows a blank one. A trailing
                    %% newline yields one extra empty line, which
                    %% processLine/1 skips because it contains no word.
                    Lines = re:split(Content, "\r\n|\r|\n", [{return, list}]),
                    processLine(Lines)
            end;
        _ ->
            io:format("Open the file failed!")
    end.
%%% Reads the whole remaining content of IoDevice into memory.
%%%
%%% Returns the content as a flat list on success. On a read error the reason
%%% is printed and `{error, Reason}` is returned. The device is closed in both
%%% cases.
readAllText(IoDevice) ->
    readAllText(IoDevice, []).

%%% Same as readAllText/1, with Content holding the text gathered so far
%%% (a deep list that is flattened once end of file is reached).
readAllText(IoDevice, Content) ->
    handleReadResult(file:read(IoDevice, 1024), IoDevice, Content).

%%% Decides what to do with the outcome of one 1024-unit read request.
handleReadResult({ok, Text}, IoDevice, Content) ->
    %% Keep the older text as the head so flattening restores reading order.
    readAllText(IoDevice, [Content | Text]);
handleReadResult(eof, IoDevice, Content) ->
    file:close(IoDevice),
    lists:flatten(Content);
handleReadResult({error, Reason}, IoDevice, _Content) ->
    io:format("Read file failed! The reason is:~p~n", [Reason]),
    file:close(IoDevice),
    {error, Reason}.
%% Regular expression for one word: a run of ASCII letters and '.' characters.
-define(PATTERN, "[a-zA-Z\.]+").

%%% Indexes a list of lines, numbering the first line 1.
%%% Every word found on a line is recorded through writeDB/2. Returns `ok`.
processLine(Lines) ->
    processLine(Lines, 1).

%%% Indexes a list of lines, where LineNumber is the number of the first
%%% line in the list.
processLine([], _LineNumber) ->
    ok;
processLine([Line | Rest], LineNumber) ->
    case re:run(Line, ?PATTERN, [global]) of
        nomatch ->
            %% No word on this line; it still consumes a line number.
            ok;
        {match, Captures} ->
            %% A global match returns one single-element list per hit;
            %% flatten them into a plain list of {Offset, Length} pairs.
            Positions = lists:flatten(Captures),
            writeDB(splitWords(Line, Positions), LineNumber)
    end,
    processLine(Rest, LineNumber + 1).
%%% Extracts the substrings of Line described by MatchItem, a list of
%%% {Offset, Length} pairs with zero-based offsets (the shape returned by
%%% re:run/3). The words are returned in the order of the pairs.
splitWords(Line, MatchItem) ->
    splitWords(Line, MatchItem, []).

%%% Same as splitWords/2, with Words holding already extracted words in
%%% reverse order; they come first in the result, in restored order.
splitWords(Line, MatchItem, Words) ->
    %% string:substr/3 is one-based, hence the + 1 on the offset.
    Extract = fun({Start, Length}) -> string:substr(Line, Start + 1, Length) end,
    lists:reverse(Words, lists:map(Extract, MatchItem)).
%%% Writes the words of one line into the ETS index table.
%%%
%%% For every word in the list, one occurrence on line LineNumber is recorded:
%%% a word that is already indexed gets its per-line counts updated through
%%% updateData/2, a new word starts with [{LineNumber, 1}].
%%% The table ?DBNAME must already exist. Returns `ok`.
writeDB([], _LineNumber) ->
    ok;
writeDB([Key | T], LineNumber) ->
    %% The table is a set keyed on the word, so a direct key lookup yields
    %% either no entry or exactly one. The table is addressed through
    %% ?DBNAME, the same macro createDB/0 and closeDB/0 use.
    Lines =
        case ets:lookup(?DBNAME, Key) of
            [#data{line = Value}] ->
                updateData(Value, LineNumber);
            [] ->
                [{LineNumber, 1}]
        end,
    ets:insert(?DBNAME, #data{word = Key, line = Lines}),
    writeDB(T, LineNumber).
%%% Records one more occurrence on line LineNumber in LineValue, a list of
%%% {Line, Times} tuples. The first entry for LineNumber gets its counter
%%% incremented; if there is none, {LineNumber, 1} is added. The result is
%%% sorted by line number. (The original author notes that this approach is
%%% not very elegant.)
updateData(LineValue, LineNumber) ->
    updateData(LineValue, LineNumber, [], false).

%%% Worker for updateData/2. Res holds the entries processed so far and
%%% Found tells whether the entry for LineNumber has already been incremented.
updateData(LineValue, LineNumber, Res, Found) ->
    Step =
        fun({Line, Times}, {Acc, false}) when Line =:= LineNumber ->
                {[{Line, Times + 1} | Acc], true};
           (Entry, {Acc, Seen}) ->
                {[Entry | Acc], Seen}
        end,
    case lists:foldl(Step, {Res, Found}, LineValue) of
        {Entries, true} ->
            lists:keysort(1, Entries);
        {Entries, false} ->
            lists:keysort(1, [{LineNumber, 1} | Entries])
    end.
%%% Entry point: indexes the default sample file.
main() ->
    main("C:\\Users\\elqstux\\Desktop\\wy.py").

%%% Creates the index table, indexes FileName and deletes the table again.
%%% Returns `true`, which is what the original main/0 returned (the result of
%%% deleting the table).
%%% NOTE(review): nothing in view writes the index out before the table is
%%% deleted - confirm whether a save step is missing.
main(FileName) ->
    createDB(),
    try
        start(FileName)
    after
        %% Delete the named table even when indexing crashes; otherwise a
        %% later createDB/0 in the same process fails because the name is
        %% still taken.
        closeDB()
    end,
    true.
%%% updateData/2 的简洁版本,利用 lists 模块的内建函数。
%%% Concise version of updateData/2 built on the lists module.
%%%
%%% Records one more occurrence on line LineNumber in LineValue, a list of
%%% {Line, Times} tuples: an existing entry for LineNumber gets its counter
%%% incremented, otherwise {LineNumber, 1} is added. The result is sorted by
%%% line number, matching the longer updateData/2 defined earlier in this
%%% module.
%%% NOTE(review): this module defines updateData/2 twice, which Erlang
%%% rejects; keep only one of the two versions.
updateData(LineValue, LineNumber) ->
    Updated =
        case lists:keyfind(LineNumber, 1, LineValue) of
            {LineNumber, Times} ->
                lists:keyreplace(LineNumber, 1, LineValue, {LineNumber, Times + 1});
            false ->
                [{LineNumber, 1} | LineValue]
        end,
    %% Prepending a new entry would leave the list in newest-first order.
    lists:keysort(1, Updated).