本文信息:本文由方法SEO顾问发表于2015-06-08 15:50:03,共 1417 字,转载请注明:Python多线程采集网站title/description/keywords_【方法SEO顾问】,如果我网站的文章对你有所帮助的话,来百度口碑给个好评呗!
手中有一个20W的URL列表的TXT文件,想把这20W的页面的title/description/keywords都提取出来。shell只能单线程,而且通过curl方式抓取速度太慢,所以改用Python多线程来采集。
该代码默认是4线程的,如果需要更多的线程,只需要把test(l,4)中的4改为更大的数值就行了。
代码中需要用到BeautifulSoup这个库,关于怎么在windows系统下安装这个库,可以看看这个教程:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import requests
import threading
import Queue
import time
# Load the raw lines of url.txt (trailing newlines kept) into the
# module-level list ``l`` that the entry point passes to test().
with open('url.txt') as url_file:
    l = list(url_file)
def _meta_content(soup, name):
    """Return the UTF-8 encoded ``content`` of the first tag whose ``name``
    attribute equals *name*, or ``""`` when the tag or attribute is absent."""
    tag = soup.find(attrs={"name": name})
    if tag is None:
        return ""
    content = tag.get('content')
    if content is None:
        return ""
    return content.encode('utf8', 'ignore')


def btdk(url):
    """Fetch *url* and extract its title, description and keywords.

    The page is downloaded with a 10 second timeout.  If the download fails
    for any reason, a stub document whose title is the URL itself is parsed
    instead, so a failed fetch still yields a result.

    The markup is lower-cased before parsing, so the returned values are
    lower-cased as well.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``(title, description, keywords)`` tuple of UTF-8 encoded strings.
        Any field that is missing from the page is returned as ``""``.
    """
    try:
        html = requests.get(url, timeout=10).text
    except Exception:
        # Best effort: any fetch failure falls back to a stub page titled
        # with the URL rather than aborting the calling worker.
        html = '<title>%s</title>' % url
    soup = BeautifulSoup(html.lower())
    # Pages without a <title> element make soup.title None; report an empty
    # title instead of raising AttributeError.
    title_tag = soup.title
    if title_tag is None:
        t = ""
    else:
        t = title_tag.text.encode('utf8', 'ignore')
    k = _meta_content(soup, "keywords")
    d = _meta_content(soup, "description")
    return t, d, k
class MyThread(threading.Thread):
    """Worker thread that takes URLs from a queue and appends one record per
    URL to ``tdk.txt``.

    Each record has the form ``url#title#description#keywords``.
    """

    # Shared by every worker so that records appended to tdk.txt by
    # different threads cannot interleave.
    _write_lock = threading.Lock()

    def __init__(self, queue, url):
        """Store the work queue.

        Args:
            queue: Queue object supplying the URLs to process.
            url: Stored on the instance but not read by ``run``; kept so
                existing callers continue to work.
        """
        threading.Thread.__init__(self)
        self.queue = queue
        self.url = url

    def run(self):
        """Process URLs from the queue forever.

        Every item taken from the queue is acknowledged with ``task_done()``
        even when processing fails, so a ``join()`` on the queue cannot hang
        because of a single bad URL.
        """
        while True:
            url = self.queue.get()
            try:
                # btdk returns (title, description, keywords) in that order.
                t, d, k = btdk(url)
                line = '#'.join((url, t, d, k)) + '\n'
                with self._write_lock:
                    with open('tdk.txt', 'a') as s:
                        s.write(line)
            except Exception:
                # Report the failure and keep the worker alive; a dead worker
                # would leave the remaining queue items unprocessed.
                import traceback
                traceback.print_exc()
            finally:
                self.queue.task_done()
def test(l, ts=4):
    """Process every URL in *l* using *ts* worker threads and wait until all
    of them have been handled.

    Args:
        l: Iterable of URL strings; trailing whitespace (including the
            newline left by ``readlines``) is removed and blank entries are
            skipped.
        ts: Number of worker threads to start.  Defaults to 4.

    Raises:
        ValueError: If *ts* is less than 1, since without any worker the
            wait for the queue to drain would never finish.
    """
    if ts < 1:
        raise ValueError('ts must be at least 1, got %r' % (ts,))
    urls = [u for u in (line.rstrip() for line in l) if u]
    # Use a queue owned by this call rather than a module-level global, so
    # the function also works when the module is imported instead of run.
    work_queue = Queue.Queue()
    for _ in range(ts):
        worker = MyThread(work_queue, urls)
        # Daemon threads: the workers loop forever, so they must not keep
        # the interpreter alive once the queue has been drained.
        worker.daemon = True
        worker.start()
    for url in urls:
        work_queue.put(url)
    work_queue.join()
if __name__ == '__main__':
    # Create the module-level work queue before starting the run.
    queue = Queue.Queue()
    start = time.time()
    test(l, 4)
    elapsed = time.time() - start
    # Report the total wall-clock time of the run, in seconds.
    print('共耗时:%s秒' % elapsed)