https://github.com/Santostang/PythonScraping/
实际内容如下形式:
代码:
import requests
import time

# Pretend to be a desktop browser; 'Connection: close' releases each socket
# after the request instead of keeping it alive.
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    'Connection': 'close',
}

# Read the tab-separated "rank<TAB>url" file and keep only the URL column.
link_list = []
with open('alexa.txt', 'r') as fh:
    file_list = fh.readlines()
    print(file_list)
    for line in file_list:
        print(line)
        url = line.split('\t')[1].replace('\n', '')
        link_list.append(url)

start = time.time()
cnt = 0    # URLs attempted
index = 0  # requests that returned without raising
for url in link_list:
    cnt += 1
    try:
        resp = requests.get(url, headers=request_headers, timeout=10)
        index += 1
        print("%d\t%s" % (index, url))
    except Exception:
        pass  # best effort: a failed fetch is simply skipped
end = time.time()
print('串行的总时间为:', end - start)
抓取网站排名
网站排名:
http://www.alexa.cn/siterank/
import requests
from bs4 import BeautifulSoup
import time

# Scrape the first 50 pages of the Alexa China site ranking and append
# "rank<TAB>url" lines to alexa.txt on the desktop.
num = 0  # number of domains collected so far
for i in range(1, 51):
    r = requests.get('http://www.alexa.cn/siterank/' + str(i))
    soup = BeautifulSoup(r.text, "lxml")
    span_list = soup.find_all('span', class_='domain-link')
    # NOTE(review): ranks are 0-based here (first site gets "0"); use
    # j + num + 1 if 1-based ranking is wanted.
    link_list = [(str(j + num), span_list[j].a['href']) for j in range(len(span_list))]
    num = num + len(link_list)
    output = "\n".join("%s\t%s" % tup for tup in link_list) + "\n"
    print(num)
    with open('C:\\Users\\Administrator\\Desktop\\alexa.txt', 'a+', encoding='utf-8') as f:
        f.write(output)
        # BUG FIX: the original ended with a bare `f.close` — an attribute
        # access that never calls close(). The `with` block already closes
        # the file, so the statement is removed entirely.
    time.sleep(3)  # be polite to the server between page fetches
多线程 threading
import threading
import time
class myThread(threading.Thread):
    """Worker thread that announces itself, runs print_time, then exits."""

    def __init__(self, name, delay):
        super().__init__()
        self.name = name    # thread label used in all printed output
        self.delay = delay  # seconds to sleep between ticks

    def run(self):
        print("Starting " + self.name)
        print_time(self.name, self.delay)
        print("Exiting " + self.name)
def print_time(threadName, delay):
    """Print threadName and the current time three times, sleeping
    `delay` seconds before each print."""
    for _ in range(3):
        time.sleep(delay)
        print(threadName, time.ctime())
# Spin up two timed worker threads and wait until both finish.
threads = []
thread1 = myThread("Thread-1", 1)
thread2 = myThread("Thread-2", 2)
thread1.start()
thread2.start()
threads.extend([thread1, thread2])
# Block the main thread until every worker has exited.
for worker in threads:
    worker.join()
print("Exiting Main Thread")
多线程抓取网页时间对比
import requests
import time

# Serial baseline: fetch every Alexa URL one after another and time the run.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    'Connection': 'close',
}

# One "rank<TAB>url" line per entry; keep only the URL column.
link_list = []
with open('alexa.txt', 'r') as fh:
    for row in fh.readlines():
        link_list.append(row.split('\t')[1].replace('\n', ''))

start = time.time()
for url in link_list:
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        print(resp.status_code, url)
    except Exception as err:
        print('Error: ', err)
end = time.time()
print('串行的总时间为:', end - start)
串行的总时间为: 560.3799998760223
代码:
import threading
import requests
import time

# Load the crawl targets: one "rank<TAB>url" line per entry in alexa.txt.
link_list = []
with open('alexa.txt', 'r') as fh:
    for row in fh.readlines():
        link_list.append(row.split('\t')[1].replace('\n', ''))

start = time.time()
class myThread(threading.Thread):
    """Thread that crawls the slice of link_list described by link_range."""

    def __init__(self, name, link_range):
        super().__init__()
        self.name = name              # label used in printed output
        self.link_range = link_range  # inclusive (lo, hi) index pair

    def run(self):
        print("Starting " + self.name)
        crawler(self.name, self.link_range)
        print("Exiting " + self.name)
def crawler(threadName, link_range):
    """Fetch link_list[lo..hi] (both ends inclusive) and print each status.

    A failed fetch is reported but does not stop the loop.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
        'Connection': 'close',
    }
    lo, hi = link_range
    for idx in range(lo, hi + 1):
        try:
            response = requests.get(link_list[idx], headers=headers, timeout=20)
            print(threadName, response.status_code, link_list[idx])
        except Exception as err:
            print(threadName, 'Error: ', err)
# Split the URL list into five inclusive index ranges, one per thread.
# BUG FIX: the last range ended at 888, but assuming link_list holds 888
# URLs (the queue-based variant of this crawler sizes its queue at 888),
# the maximum valid index is 887 — crawler() raised IndexError on the
# out-of-range access, which the broad except reported as a fetch error.
thread_list = []
link_range_list = [(0, 200), (201, 400), (401, 600), (601, 800), (801, 887)]
# Create and start all five workers.
for i in range(1, 6):
    thread = myThread("Thread-" + str(i), link_range_list[i - 1])
    thread.start()
    thread_list.append(thread)
# Wait for every worker to finish before reporting the elapsed time.
for thread in thread_list:
    thread.join()
end = time.time()
print('简单多线程爬虫的总时间为:', end - start)
print("Exiting Main Thread")
简单多线程爬虫的总时间为: 155.1600000858307
Exiting Main Thread
代码3:
import threading
import requests
import time
import queue as Queue

# Build link_list from the "rank<TAB>url" lines of alexa.txt.
link_list = []
with open('alexa.txt', 'r') as fh:
    for row in fh.readlines():
        link_list.append(row.split('\t')[1].replace('\n', ''))

start = time.time()
class myThread(threading.Thread):
    """Worker that keeps pulling URLs from the shared queue until empty."""

    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name  # label used in printed output
        self.q = q        # shared queue.Queue of URLs

    def run(self):
        print("Starting " + self.name)
        while True:
            try:
                crawler(self.name, self.q)
            # BUG FIX: the original used a bare `except:` which swallowed
            # every error (including real bugs such as NameError or
            # KeyboardInterrupt). The only exception meant to end the loop
            # is queue.Empty, raised by q.get(timeout=2) inside crawler()
            # once the queue is drained — catch exactly that.
            except Queue.Empty:
                break
        print("Exiting " + self.name)
def crawler(threadName, q):
    """Take one URL from q and fetch it, printing the HTTP status.

    q.get(timeout=2) raises queue.Empty when the queue stays empty,
    which propagates to the caller as the termination signal.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
        'Connection': 'close',
    }
    url = q.get(timeout=2)
    try:
        response = requests.get(url, headers=headers, timeout=20)
        print(threadName, response.status_code, url)
    except Exception as err:
        print(threadName, 'Error: ', err)
# Five workers share one bounded queue holding all the URLs.
threadList = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"]
workQueue = Queue.Queue(888)
threads = []
# Start the workers first; they block on q.get() until URLs arrive.
for tName in threadList:
    worker = myThread(tName, workQueue)
    worker.start()
    threads.append(worker)
# Feed every URL into the shared queue.
for url in link_list:
    workQueue.put(url)
# Wait for all workers to drain the queue and exit.
for worker in threads:
    worker.join()
end = time.time()
print('简单多线程爬虫的总时间为:', end - start)
print("Exiting Main Thread")
简单多线程爬虫的总时间为: 114.97500014305115
多进程
from multiprocessing import Process, Queue
import time
import requests

# Build the URL list from alexa.txt ("rank<TAB>url" per line).
link_list = []
with open('alexa.txt', 'r') as fh:
    for row in fh.readlines():
        link_list.append(row.split('\t')[1].replace('\n', ''))

start = time.time()
class MyProcess(Process):
    """Worker process that drains the shared queue via crawler()."""

    def __init__(self, q):
        super().__init__()
        self.q = q  # multiprocessing.Queue of URLs to fetch

    def run(self):
        print("Starting ", self.pid)
        # Keep pulling work until the shared queue is empty.
        while not self.q.empty():
            crawler(self.q)
        print("Exiting ", self.pid)
def crawler(q):
    """Pull one URL from q and fetch it, printing queue size and status."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
        'Connection': 'close',
    }
    url = q.get(timeout=2)
    try:
        response = requests.get(url, headers=headers, timeout=20)
        print(q.qsize(), response.status_code, url)
    except Exception as err:
        print(q.qsize(), url, 'Error: ', err)
if __name__ == '__main__':
    ProcessNames = ["Process-1", "Process-2", "Process-3"]
    workQueue = Queue(888)
    # Fill the queue before the workers start.
    for url in link_list:
        workQueue.put(url)
    # BUG FIX: the original called p.join() inside the creation loop, so
    # each process had to finish — draining the entire queue — before the
    # next one was even started. That defeats the parallelism, which the
    # pasted timing confirms (522s, barely better than the 560s serial
    # run). Start all workers first, then join them all.
    processes = []
    for i in range(0, 3):
        p = MyProcess(workQueue)
        p.daemon = True
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    end = time.time()
    print('Process + Queue多进程爬虫的总时间为:', end-start)
    print('Main process Ended!')
Process + Queue多进程爬虫的总时间为: 522.6050000190735
from multiprocessing import Pool, Manager
import time
import requests

# Build the URL list from alexa.txt ("rank<TAB>url" per line).
link_list = []
with open('alexa.txt', 'r') as fh:
    for row in fh.readlines():
        link_list.append(row.split('\t')[1].replace('\n', ''))

start = time.time()
def crawler(q, index):
    """Worker body for the pool: drain q, fetching each URL in turn.

    index only labels the worker in the printed output.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
        'Connection': 'close',
    }
    process_label = 'Process-' + str(index)
    while not q.empty():
        url = q.get(timeout=2)
        try:
            response = requests.get(url, headers=headers, timeout=20)
            print(process_label, q.qsize(), response.status_code, url)
        except Exception as err:
            print(process_label, q.qsize(), url, 'Error: ', err)
if __name__ == '__main__':
    # A Manager-backed queue can be shared with Pool workers (a plain
    # multiprocessing.Queue cannot be passed through apply_async).
    manager = Manager()
    workQueue = manager.Queue(1000)
    for url in link_list:
        workQueue.put(url)
    # Three pool slots serve four submitted tasks; the fourth runs as soon
    # as a slot frees up.
    pool = Pool(processes=3)
    for worker_id in range(4):
        pool.apply_async(crawler, args=(workQueue, worker_id))
    print("Started processes")
    pool.close()
    pool.join()
    end = time.time()
    print('Pool + Queue多进程爬虫的总时间为:', end-start)
    print('Main process Ended!')
Pool + Queue多进程爬虫的总时间为: 251.4449999332428
Main process Ended!
多协程爬虫
(稍后补充)