这是一段简单的代码,用于抓取维基百科数据,也是一个简单的多线程编程例子:占用内存很少,增大线程数后效率很高。
import sys, thread, threading, time;
import commands
# Number of worker threads that have reported completion; incremented by
# extract_qid.
finish_num = 0
# Serializes updates to finish_num across the worker threads.
mutex = threading.Lock()
def _fetch_query(query, tmp_path, fout):
    """Download the page for ``query`` via ``tmp_path`` and append it to ``fout``.

    Writes one header line holding the requested URL, followed by whatever
    wget stored in ``tmp_path``. The temporary file is removed afterwards,
    whether or not the download and copy succeeded.
    """
    import os
    import subprocess

    url = "zh.wikipedia.org/zh-hans/" + query
    try:
        # Pass an argument list (no shell) so characters in the query such as
        # quotes, `$` or backticks are never interpreted as shell syntax.
        # wget's console output is discarded.
        with open(os.devnull, "w") as devnull:
            subprocess.call(
                ["wget", url, "-O", tmp_path], stdout=devnull, stderr=devnull
            )
        # Open the download before writing the header so that a missing
        # temp file leaves no orphan header line in the output.
        with open(tmp_path, "r") as tmp_fin:
            fout.write(url + "\n")
            fout.writelines(tmp_fin)
    finally:
        try:
            os.remove(tmp_path)
        except OSError:
            # Nothing to clean up (the temp file was never created).
            pass


def extract_qid(id, num_of_thread):
    """Fetch the wiki pages for this worker's share of the query file.

    Reads queries (one per line) from the file named by ``sys.argv[1]`` and
    handles only the lines whose zero-based line number is congruent to
    ``id`` modulo ``num_of_thread``. Each fetched page is appended to
    ``sys.argv[2] + ".part" + str(id)``, preceded by a line with its URL.

    Args:
        id: Index of this worker; also used in the names of the output part
            file and the temporary download file. (The name shadows the
            builtin but is kept for existing callers.)
        num_of_thread: Total number of workers sharing the query file.

    Returns:
        True when the whole query file was processed, False when an error
        outside the per-query handling occurred (for example the input or
        output file could not be opened); that error is printed.

    Side effects:
        Increments the module-level ``finish_num`` under ``mutex`` exactly
        once per call, on success and on failure alike, so code waiting on
        the counter is not left waiting for a worker that already exited.
    """
    global finish_num
    tmp_path = "fetch_wiki/tmp_search_" + str(id)
    try:
        with open(sys.argv[1], "r") as fin:
            with open(sys.argv[2] + ".part" + str(id), "w") as fout:
                for count, line in enumerate(fin):
                    if count % num_of_thread != id:
                        continue
                    query = line.strip()
                    try:
                        _fetch_query(query, tmp_path, fout)
                    except Exception as err:
                        # Best effort: one bad query must not abort the
                        # worker, but report it instead of hiding it.
                        sys.stderr.write(
                            "extract_qid[%s]: skipping %r: %s\n"
                            % (id, query, err)
                        )
        return True
    except Exception as e:
        print(e)
        return False
    finally:
        with mutex:
            finish_num += 1
# Launch one worker per requested thread; the count comes from sys.argv[3].
thread_count = int(sys.argv[3])
workers = []
for i in range(thread_count):
    worker = threading.Thread(target=extract_qid, args=(i, thread_count))
    # Daemon threads do not keep the process alive if the main thread exits
    # early (e.g. on Ctrl-C).
    worker.daemon = True
    worker.start()
    workers.append(worker)

# Wait on the threads themselves rather than on a completion counter, so a
# worker that dies without reporting cannot leave this loop spinning forever.
# The short join timeout keeps the main thread responsive to interrupts.
for worker in workers:
    while worker.is_alive():
        worker.join(1)