https://github.com/Santostang/PythonScraping/
实际内容如下形式:
代码:
import requests
import time

# Pretend to be a desktop browser; 'Connection: close' releases each socket
# after the request instead of keeping it alive.
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    'Connection': 'close',
}

# Read the tab-separated "rank<TAB>url" file and keep only the URL column.
link_list = []
with open('alexa.txt', 'r') as fh:
    file_list = fh.readlines()
    print(file_list)
    for line in file_list:
        print(line)
        url = line.split('\t')[1].replace('\n', '')
        link_list.append(url)

start = time.time()
cnt = 0    # URLs attempted
index = 0  # requests that returned without raising
for url in link_list:
    cnt += 1
    try:
        resp = requests.get(url, headers=request_headers, timeout=10)
        index += 1
        print("%d\t%s" % (index, url))
    except Exception:
        pass  # best effort: a failed fetch is simply skipped
end = time.time()
print('串行的总时间为:', end - start)
抓取网站排名
网站排名:
http://www.alexa.cn/siterank/
import requests
from bs4 import BeautifulSoup
import time

# Scrape the first 50 pages of the Alexa China site ranking and append
# "rank<TAB>url" lines to alexa.txt on the desktop.
num = 0  # number of domains collected so far
for i in range(1, 51):
    r = requests.get('http://www.alexa.cn/siterank/' + str(i))
    soup = BeautifulSoup(r.text, "lxml")
    span_list = soup.find_all('span', class_='domain-link')
    # NOTE(review): ranks are 0-based here (first site gets "0"); use
    # j + num + 1 if 1-based ranking is wanted.
    link_list = [(str(j + num), span_list[j].a['href']) for j in range(len(span_list))]
    num = num + len(link_list)
    output = "\n".join("%s\t%s" % tup for tup in link_list) + "\n"
    print(num)
    with open('C:\\Users\\Administrator\\Desktop\\alexa.txt', 'a+', encoding='utf-8') as f:
        f.write(output)
        # BUG FIX: the original ended with a bare `f.close` — an attribute
        # access that never calls close(). The `with` block already closes
        # the file, so the statement is removed entirely.
    time.sleep(3)  # be polite to the server between page fetches
多线程 threading
import threading
import time
class myThread(threading.Thread):
    """Worker thread that announces itself, runs print_time, then exits."""

    def __init__(self, name, delay):
        super().__init__()
        self.name = name    # thread label used in all printed output
        self.delay = delay  # seconds to sleep between ticks

    def run(self):
        print("Starting " + self.name)
        print_time(self.name, self.delay)
        print("Exiting " + self.name)
def print_time(threadName, delay):
    """Print threadName and the current time three times, sleeping
    `delay` seconds before each print."""
    for _ in range(3):
        time.sleep(delay)
        print(threadName, time.ctime())
# Spin up two timed worker threads and wait until both finish.
threads = []
thread1 = myThread("Thread-1", 1)
thread2 = myThread("Thread-2", 2)
thread1.start()
thread2.start()
threads.extend([thread1, thread2])
# Block the main thread until every worker has exited.
for worker in threads:
    worker.join()
print("Exiting Main Thread")
多线程抓取网页时间对比
import requests
import time

# Serial baseline: fetch every Alexa URL one after another and time the run.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
    'Connection': 'close',
}

# One "rank<TAB>url" line per entry; keep only the URL column.
link_list = []
with open('alexa.txt', 'r') as fh:
    for row in fh.readlines():
        link_list.append(row.split('\t')[1].replace('\n', ''))

start = time.time()
for url in link_list:
    try:
        resp = requests.get(url, headers=headers, timeout=10)
        print(resp.status_code, url)
    except Exception as err:
        print('Error: ', err)
end = time.time()
print('串行的总时间为:', end - start)
串行的总时间为: 560.3799998760223
代码:
import threading
import requests
import time

# Load the crawl targets: one "rank<TAB>url" line per entry in alexa.txt.
link_list = []
with open('alexa.txt', 'r') as fh:
    for row in fh.readlines():
        link_list.append(row.split('\t')[1].replace('\n', ''))

start = time.time()
class myThread(threading.Thread):
    """Thread that crawls the slice of link_list described by link_range."""

    def __init__(self, name, link_range):
        super().__init__()
        self.name = name              # label used in printed output
        self.link_range = link_range  # inclusive (lo, hi) index pair

    def run(self):
        print("Starting " + self.name)
        crawler(self.name, self.link_range)
        print("Exiting " + self.name)
def crawler(threadName, link_range):
    """Fetch link_list[lo..hi] (both ends inclusive) and print each status.

    A failed fetch is reported but does not stop the loop.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
        'Connection': 'close',
    }
    lo, hi = link_range
    for idx in range(lo, hi + 1):
        try:
            response = requests.get(link_list[idx], headers=headers, timeout=20)
            print(threadName, response.status_code, link_list[idx])
        except Exception as err:
            print(threadName, 'Error: ', err)
# Split the URL list into five inclusive index ranges, one per thread.
# BUG FIX: the last range ended at 888, but assuming link_list holds 888
# URLs (the queue-based variant of this crawler sizes its queue at 888),
# the maximum valid index is 887 — crawler() raised IndexError on the
# out-of-range access, which the broad except reported as a fetch error.
thread_list = []
link_range_list = [(0, 200), (201, 400), (401, 600), (601, 800), (801, 887)]
# Create and start all five workers.
for i in range(1, 6):
    thread = myThread("Thread-" + str(i), link_range_list[i - 1])
    thread.start()
    thread_list.append(thread)
# Wait for every worker to finish before reporting the elapsed time.
for thread in thread_list:
    thread.join()
end = time.time()
print('简单多线程爬虫的总时间为:', end - start)
print("Exiting Main Thread")
简单多线程爬虫的总时间为: 155.1600000858307
Exiting Main Thread
代码3:
import threading
import requests
import time
import queue as Queue

# Build link_list from the "rank<TAB>url" lines of alexa.txt.
link_list = []
with open('alexa.txt', 'r') as fh:
    for row in fh.readlines():
        link_list.append(row.split('\t')[1].replace('\n', ''))

start = time.time()
class myThread(threading.Thread):
    """Worker that keeps pulling URLs from the shared queue until empty."""

    def __init__(self, name, q):
        threading.Thread.__init__(self)
        self.name = name  # label used in printed output
        self.q = q        # shared queue.Queue of URLs

    def run(self):
        print("Starting " + self.name)
        while True:
            try:
                crawler(self.name, self.q)
            # BUG FIX: the original used a bare `except:` which swallowed
            # every error (including real bugs such as NameError or
            # KeyboardInterrupt). The only exception meant to end the loop
            # is queue.Empty, raised by q.get(timeout=2) inside crawler()
            # once the queue is drained — catch exactly that.
            except Queue.Empty:
                break
        print("Exiting " + self.name)
def crawler(threadName, q):
    """Take one URL from q and fetch it, printing the HTTP status.

    q.get(timeout=2) raises queue.Empty when the queue stays empty,
    which propagates to the caller as the termination signal.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
        'Connection': 'close',
    }
    url = q.get(timeout=2)
    try:
        response = requests.get(url, headers=headers, timeout=20)
        print(threadName, response.status_code, url)
    except Exception as err:
        print(threadName, 'Error: ', err)
# Five workers share one bounded queue holding all the URLs.
threadList = ["Thread-1", "Thread-2", "Thread-3", "Thread-4", "Thread-5"]
workQueue = Queue.Queue(888)
threads = []
# Start the workers first; they block on q.get() until URLs arrive.
for tName in threadList:
    worker = myThread(tName, workQueue)
    worker.start()
    threads.append(worker)
# Feed every URL into the shared queue.
for url in link_list:
    workQueue.put(url)
# Wait for all workers to drain the queue and exit.
for worker in threads:
    worker.join()
end = time.time()
print('简单多线程爬虫的总时间为:', end - start)
print("Exiting Main Thread")
简单多线程爬虫的总时间为: 114.97500014305115
多进程
from multiprocessing import Process, Queue
import time
import requests

# Build the URL list from alexa.txt ("rank<TAB>url" per line).
link_list = []
with open('alexa.txt', 'r') as fh:
    for row in fh.readlines():
        link_list.append(row.split('\t')[1].replace('\n', ''))

start = time.time()
class MyProcess(Process):
    """Worker process that drains the shared queue via crawler()."""

    def __init__(self, q):
        super().__init__()
        self.q = q  # multiprocessing.Queue of URLs to fetch

    def run(self):
        print("Starting ", self.pid)
        # Keep pulling work until the shared queue is empty.
        while not self.q.empty():
            crawler(self.q)
        print("Exiting ", self.pid)
def crawler(q):
    """Pull one URL from q and fetch it, printing queue size and status."""
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
        'Connection': 'close',
    }
    url = q.get(timeout=2)
    try:
        response = requests.get(url, headers=headers, timeout=20)
        print(q.qsize(), response.status_code, url)
    except Exception as err:
        print(q.qsize(), url, 'Error: ', err)
if __name__ == '__main__':
    ProcessNames = ["Process-1", "Process-2", "Process-3"]
    workQueue = Queue(888)
    # Fill the queue before the workers start.
    for url in link_list:
        workQueue.put(url)
    # BUG FIX: the original called p.join() inside the creation loop, so
    # each process had to finish — draining the entire queue — before the
    # next one was even started. That defeats the parallelism, which the
    # pasted timing confirms (522s, barely better than the 560s serial
    # run). Start all workers first, then join them all.
    processes = []
    for i in range(0, 3):
        p = MyProcess(workQueue)
        p.daemon = True
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    end = time.time()
    print('Process + Queue多进程爬虫的总时间为:', end-start)
    print('Main process Ended!')
Process + Queue多进程爬虫的总时间为: 522.6050000190735
from multiprocessing import Pool, Manager
import time
import requests

# Build the URL list from alexa.txt ("rank<TAB>url" per line).
link_list = []
with open('alexa.txt', 'r') as fh:
    for row in fh.readlines():
        link_list.append(row.split('\t')[1].replace('\n', ''))

start = time.time()
def crawler(q, index):
    """Worker body for the pool: drain q, fetching each URL in turn.

    index only labels the worker in the printed output.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6',
        'Connection': 'close',
    }
    process_label = 'Process-' + str(index)
    while not q.empty():
        url = q.get(timeout=2)
        try:
            response = requests.get(url, headers=headers, timeout=20)
            print(process_label, q.qsize(), response.status_code, url)
        except Exception as err:
            print(process_label, q.qsize(), url, 'Error: ', err)
if __name__ == '__main__':
    # A Manager-backed queue can be shared with Pool workers (a plain
    # multiprocessing.Queue cannot be passed through apply_async).
    manager = Manager()
    workQueue = manager.Queue(1000)
    for url in link_list:
        workQueue.put(url)
    # Three pool slots serve four submitted tasks; the fourth runs as soon
    # as a slot frees up.
    pool = Pool(processes=3)
    for worker_id in range(4):
        pool.apply_async(crawler, args=(workQueue, worker_id))
    print("Started processes")
    pool.close()
    pool.join()
    end = time.time()
    print('Pool + Queue多进程爬虫的总时间为:', end-start)
    print('Main process Ended!')
Pool + Queue多进程爬虫的总时间为: 251.4449999332428
Main process Ended!
多协程爬虫
(稍后补充)