import threading
import time
from queue import Empty, LifoQueue, PriorityQueue, Queue

import requests
from lxml import etree
# Shutdown flags polled by the worker threads' run() loops; flipped to True
# by run_multithread() once the corresponding queue has been drained.
CRAWL_EXIT=False
PARSE_EXIT=False
class ThreadCrawl(threading.Thread):
    """Downloader thread: pulls page numbers from pageQueue, fetches each
    listing page's HTML, and pushes the text onto dataQueue until the
    module-level CRAWL_EXIT flag is set."""

    def __init__(self, threadName, pageQueue, dataQueue):
        # Original had `init` (mangled `__init__`), so the constructor never ran.
        super().__init__()
        self.threadName = threadName  # label used only in log output
        self.pageQueue = pageQueue    # Queue[int] of page numbers to fetch
        self.dataQueue = dataQueue    # Queue[str] receiving downloaded HTML

    def run(self):
        print("启动" + self.threadName)
        while not CRAWL_EXIT:
            print("准备获取网页- - -")
            try:
                # Non-blocking get: raises Empty when no pages remain, which
                # the original left unhandled and crashed on.
                pagenum = self.pageQueue.get(False)
            except Empty:
                continue
            content = self.getpage(pagenum)
            if content is not None:
                self.dataQueue.put(content)
            print("成功获取- - - ")
        print("结束" + self.threadName)

    def getpage(self, pagenum):
        """Download one listing page; return its HTML text, or None on error.

        The original printed the exception and then hit a NameError on `res`.
        """
        url = "http://www.qiushibaike.com/text/page/" + str(pagenum) + "/"
        headers = {
            'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
            'Connection': 'keep-alive'
        }
        try:
            res = requests.get(url, headers=headers)
        except Exception as e:
            print(e)
            return None
        return res.text
class ThreadParse(threading.Thread):
    """Parser thread: pulls downloaded HTML pages off dataQueue and extracts
    item fields until the module-level PARSE_EXIT flag is set."""

    def __init__(self, threadName, dataQueue):
        # Original had `init` (mangled `__init__`), so the constructor never ran.
        super().__init__()
        self.threadName = threadName  # label used only in log output
        self.dataQueue = dataQueue    # Queue[str] of HTML pages to parse

    def run(self):
        print("启动" + self.threadName)
        while not PARSE_EXIT:
            print("准备解析数据- - -")
            try:
                html = self.dataQueue.get(False)
            except Empty:
                # Queue momentarily empty; poll again. The original's bare
                # `except: pass` also silently hid real parse errors.
                continue
            self.parsedata(html)
            print("成功得到数据- - - ")
        print("结束" + self.threadName)

    def parsedata(self, html):
        """Extract username/content/like/comment fields from one listing page
        and print each item dict."""
        rootnode = etree.HTML(html)
        print("parsedata....")
        node_list = rootnode.xpath('//div[contains(@id,"qiushi_tag")]')
        for node in node_list:
            try:
                username = node.xpath('./div')[0].xpath('.//h2')[0].text
                content = node.xpath('.//div[@class="content"]/span')[0].text
                like = node.xpath('.//i')[0].text
                comments = node.xpath('.//i')[1].text
                items = {
                    "username": username,
                    "content": content,
                    "like": like,
                    "comments": comments
                }
                print(items)
            except Exception as e:
                # A malformed item is skipped rather than aborting the page.
                print(e)
def run_multithread():
    """Crawl 20 listing pages with three downloader threads, then parse the
    collected HTML with three parser threads.

    Sets the module-level CRAWL_EXIT / PARSE_EXIT flags to shut the worker
    threads down once their input queues are drained.
    """
    global CRAWL_EXIT, PARSE_EXIT
    page_count = 20  # original named this `len`, shadowing the builtin
    pageQueue = Queue(page_count)
    for i in range(1, page_count + 1):
        pageQueue.put(i)
    dataQueue = Queue()
    # Start three crawler threads.
    crawl_names = ['采集线程001', '采集线程002', '采集线程003']
    threadCrawls = []
    for threadname in crawl_names:
        thread = ThreadCrawl(threadname, pageQueue, dataQueue)
        thread.start()
        threadCrawls.append(thread)
    # Wait until every page number has been taken; sleep instead of the
    # original hot `pass` spin loop that pegged a CPU core.
    while not pageQueue.empty():
        time.sleep(0.1)
    CRAWL_EXIT = True
    for thread in threadCrawls:
        thread.join()
    # Start three parser threads.
    parse_names = ['解析线程001', '解析线程002', '解析线程003']
    threadParses = []
    for threadname in parse_names:
        thread = ThreadParse(threadname, dataQueue)
        thread.start()
        threadParses.append(thread)
    # Wait until every downloaded page has been consumed.
    while not dataQueue.empty():
        time.sleep(0.1)
    PARSE_EXIT = True
    for thread in threadParses:
        thread.join()
# Original guard was mangled to `if name == ‘main’:` (lost dunders, smart
# quotes) and could never run.
if __name__ == '__main__':
    starttime = time.time()
    run_multithread()
    # Report total wall-clock time for the crawl + parse run.
    print(time.time() - starttime)
import requests
from lxml import etree
import json
import time
def write_to_file(filename, html):
    """Save *html* to *filename* as UTF-8 text.

    Original used smart quotes (a SyntaxError) and a manual open/close;
    `with` guarantees the handle is closed even if the write raises.
    """
    with open(filename, 'w', encoding='utf8') as f:
        f.write(html)
def getpage(pagenum):
    """Download listing page *pagenum*, save it to qiushibaike.html, and
    hand the HTML to parsedata().

    Original used smart quotes (a SyntaxError) and, when the request raised,
    printed the exception then crashed with NameError on `res`.
    """
    url = "http://www.qiushibaike.com/text/page/" + str(pagenum) + "/"
    headers = {
        'User-Agent': 'Mozilla/5.0(Linux;Android 6.0;Nexus 5 Build / MRA58N) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 95.0.4638.54 MobileSafari / 537.36 Edg / 95.0.1020.40',
        'Connection': 'keep-alive'
    }
    try:
        res = requests.get(url, headers=headers)
    except Exception as e:
        print(e)
        return
    html = res.text
    print(html)
    write_to_file("qiushibaike.html", html)
    parsedata(html)
def parsedata(html):
    """Extract username/content/like/comment fields from one listing page
    and print each item dict."""
    rootnode = etree.HTML(html)
    print("parsedata....")
    # 'qiushi_tag' matches the item containers — the original's 'qiushu_tag'
    # typo (cf. the identical xpath in ThreadParse.parsedata) matched nothing.
    node_list = rootnode.xpath('//div[contains(@id,"qiushi_tag")]')
    for node in node_list:
        try:
            username = node.xpath('./div')[0].xpath('.//h2')[0].text
            content = node.xpath('.//div[@class="content"]/span')[0].text
            like = node.xpath('.//i')[0].text
            comments = node.xpath('.//i')[1].text
            items = {
                "username": username,
                "content": content,
                "like": like,
                "comments": comments
            }
            print(items)
        except Exception as e:
            # A malformed item is skipped rather than aborting the page.
            print(e)
# Original guard was mangled to `if name == ‘main’:` (lost dunders, smart
# quotes) and could never run.
if __name__ == '__main__':
    getpage(2)
from queue import Queue, LifoQueue, PriorityQueue
def start_line(mark):
    """Print a section banner: the *mark* label framed by underscores.

    Original line was `print(“"+mark+"____”)` — smart quotes plus the leading
    underscores eaten by markdown italics; reconstructed symmetrically.
    """
    print("____" + mark + "____")
def case_queue():
    """Demonstrate FIFO Queue: items print in insertion order (0,1,2,3)."""
    start_line('normal queue')  # original used smart quotes: SyntaxError
    que_obj = Queue()
    for i in range(4):
        que_obj.put(i)
    while not que_obj.empty():
        print(que_obj.get())
def case_lifoqueue():
    """Demonstrate LIFO queue: items print in reverse order (3,2,1,0)."""
    start_line('lifo queue')  # original used smart quotes: SyntaxError
    que_obj = LifoQueue()
    for i in range(4):
        que_obj.put(i)
    while not que_obj.empty():
        print(que_obj.get())
class Job(object):
    """A prioritisable work item; lower `level` means higher priority."""

    def __init__(self, level, description):
        # Original had `init` (mangled `__init__`) plus a pointless bare return.
        self.level = level              # numeric priority, smaller = sooner
        self.description = description  # human-readable label

    def __lt__(self, other):
        # PriorityQueue orders entries with `<`, so lowest level pops first.
        return self.level < other.level
def case_priorityqueue():
    """Demonstrate PriorityQueue with Job items: lowest level pops first,
    so the output order is 紧急工作, 中级别工作, 低级别工作."""
    start_line('priority queue')  # original used smart quotes: SyntaxError
    que_obj = PriorityQueue()
    que_obj.put(Job(5, '中级别工作'))
    que_obj.put(Job(10, '低级别工作'))
    que_obj.put(Job(1, '紧急工作'))
    while not que_obj.empty():
        print(que_obj.get().description)
# Original guard was mangled to `if name == ‘main’:` (lost dunders, smart
# quotes) and could never run.
if __name__ == '__main__':
    case_queue()
    case_lifoqueue()
    case_priorityqueue()