import threading
import time
from queue import Empty, LifoQueue, PriorityQueue, Queue

import requests
from lxml import etree
# Shutdown flags polled by the worker threads' run() loops; flipped to True
# by run_multithread() once the corresponding queue has been drained.
CRAWL_EXIT=False
PARSE_EXIT=False
class ThreadCrawl(threading.Thread):
    """Downloader thread: pulls page numbers from pageQueue, fetches each
    listing page's HTML, and pushes the text onto dataQueue until the
    module-level CRAWL_EXIT flag is set."""

    def __init__(self, threadName, pageQueue, dataQueue):
        # Original had `init` (mangled `__init__`), so the constructor never ran.
        super().__init__()
        self.threadName = threadName  # label used only in log output
        self.pageQueue = pageQueue    # Queue[int] of page numbers to fetch
        self.dataQueue = dataQueue    # Queue[str] receiving downloaded HTML

    def run(self):
        print("启动" + self.threadName)
        while not CRAWL_EXIT:
            print("准备获取网页- - -")
            try:
                # Non-blocking get: raises Empty when no pages remain, which
                # the original left unhandled and crashed on.
                pagenum = self.pageQueue.get(False)
            except Empty:
                continue
            content = self.getpage(pagenum)
            if content is not None:
                self.dataQueue.put(content)
            print("成功获取- - - ")
        print("结束" + self.threadName)

    def getpage(self, pagenum):
        """Download one listing page; return its HTML text, or None on error.

        The original printed the exception and then hit a NameError on `res`.
        """
        url = "http://www.qiushibaike.com/text/page/" + str(pagenum) + "/"
        headers = {
            'User-Agent': 'User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
            'Connection': 'keep-alive'
        }
        try:
            res = requests.get(url, headers=headers)
        except Exception as e:
            print(e)
            return None
        return res.text
class ThreadParse(threading.Thread):
    """Parser thread: pulls downloaded HTML pages off dataQueue and extracts
    item fields until the module-level PARSE_EXIT flag is set."""

    def __init__(self, threadName, dataQueue):
        # Original had `init` (mangled `__init__`), so the constructor never ran.
        super().__init__()
        self.threadName = threadName  # label used only in log output
        self.dataQueue = dataQueue    # Queue[str] of HTML pages to parse

    def run(self):
        print("启动" + self.threadName)
        while not PARSE_EXIT:
            print("准备解析数据- - -")
            try:
                html = self.dataQueue.get(False)
            except Empty:
                # Queue momentarily empty; poll again. The original's bare
                # `except: pass` also silently hid real parse errors.
                continue
            self.parsedata(html)
            print("成功得到数据- - - ")
        print("结束" + self.threadName)

    def parsedata(self, html):
        """Extract username/content/like/comment fields from one listing page
        and print each item dict."""
        rootnode = etree.HTML(html)
        print("parsedata....")
        node_list = rootnode.xpath('//div[contains(@id,"qiushi_tag")]')
        for node in node_list:
            try:
                username = node.xpath('./div')[0].xpath('.//h2')[0].text
                content = node.xpath('.//div[@class="content"]/span')[0].text
                like = node.xpath('.//i')[0].text
                comments = node.xpath('.//i')[1].text
                items = {
                    "username": username,
                    "content": content,
                    "like": like,
                    "comments": comments
                }
                print(items)
            except Exception as e:
                # A malformed item is skipped rather than aborting the page.
                print(e)
def run_multithread():
    """Crawl 20 listing pages with three downloader threads, then parse the
    collected HTML with three parser threads.

    Sets the module-level CRAWL_EXIT / PARSE_EXIT flags to shut the worker
    threads down once their input queues are drained.
    """
    global CRAWL_EXIT, PARSE_EXIT
    page_count = 20  # original named this `len`, shadowing the builtin
    pageQueue = Queue(page_count)
    for i in range(1, page_count + 1):
        pageQueue.put(i)
    dataQueue = Queue()
    # Start three crawler threads.
    crawl_names = ['采集线程001', '采集线程002', '采集线程003']
    threadCrawls = []
    for threadname in crawl_names:
        thread = ThreadCrawl(threadname, pageQueue, dataQueue)
        thread.start()
        threadCrawls.append(thread)
    # Wait until every page number has been taken; sleep instead of the
    # original hot `pass` spin loop that pegged a CPU core.
    while not pageQueue.empty():
        time.sleep(0.1)
    CRAWL_EXIT = True
    for thread in threadCrawls:
        thread.join()
    # Start three parser threads.
    parse_names = ['解析线程001', '解析线程002', '解析线程003']
    threadParses = []
    for threadname in parse_names:
        thread = ThreadParse(threadname, dataQueue)
        thread.start()
        threadParses.append(thread)
    # Wait until every downloaded page has been consumed.
    while not dataQueue.empty():
        time.sleep(0.1)
    PARSE_EXIT = True
    for thread in threadParses:
        thread.join()
# Original guard was mangled to `if name == ‘main’:` (lost dunders, smart
# quotes) and could never run.
if __name__ == '__main__':
    starttime = time.time()
    run_multithread()
    # Report total wall-clock time for the crawl + parse run.
    print(time.time() - starttime)
import requests
from lxml import etree
import json
import time
def write_to_file(filename, html):
    """Save *html* to *filename* as UTF-8 text.

    Original used smart quotes (a SyntaxError) and a manual open/close;
    `with` guarantees the handle is closed even if the write raises.
    """
    with open(filename, 'w', encoding='utf8') as f:
        f.write(html)
def getpage(pagenum):
    """Download listing page *pagenum*, save it to qiushibaike.html, and
    hand the HTML to parsedata().

    Original used smart quotes (a SyntaxError) and, when the request raised,
    printed the exception then crashed with NameError on `res`.
    """
    url = "http://www.qiushibaike.com/text/page/" + str(pagenum) + "/"
    headers = {
        'User-Agent': 'Mozilla/5.0(Linux;Android 6.0;Nexus 5 Build / MRA58N) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 95.0.4638.54 MobileSafari / 537.36 Edg / 95.0.1020.40',
        'Connection': 'keep-alive'
    }
    try:
        res = requests.get(url, headers=headers)
    except Exception as e:
        print(e)
        return
    html = res.text
    print(html)
    write_to_file("qiushibaike.html", html)
    parsedata(html)
def parsedata(html):
    """Extract username/content/like/comment fields from one listing page
    and print each item dict."""
    rootnode = etree.HTML(html)
    print("parsedata....")
    # 'qiushi_tag' matches the item containers — the original's 'qiushu_tag'
    # typo (cf. the identical xpath in ThreadParse.parsedata) matched nothing.
    node_list = rootnode.xpath('//div[contains(@id,"qiushi_tag")]')
    for node in node_list:
        try:
            username = node.xpath('./div')[0].xpath('.//h2')[0].text
            content = node.xpath('.//div[@class="content"]/span')[0].text
            like = node.xpath('.//i')[0].text
            comments = node.xpath('.//i')[1].text
            items = {
                "username": username,
                "content": content,
                "like": like,
                "comments": comments
            }
            print(items)
        except Exception as e:
            # A malformed item is skipped rather than aborting the page.
            print(e)
# Original guard was mangled to `if name == ‘main’:` (lost dunders, smart
# quotes) and could never run.
if __name__ == '__main__':
    getpage(2)
from queue import Queue, LifoQueue, PriorityQueue
def start_line(mark):
    """Print a section banner: the *mark* label framed by underscores.

    Original line was `print(“"+mark+"____”)` — smart quotes plus the leading
    underscores eaten by markdown italics; reconstructed symmetrically.
    """
    print("____" + mark + "____")
def case_queue():
    """Demonstrate FIFO Queue: items print in insertion order (0,1,2,3)."""
    start_line('normal queue')  # original used smart quotes: SyntaxError
    que_obj = Queue()
    for i in range(4):
        que_obj.put(i)
    while not que_obj.empty():
        print(que_obj.get())
def case_lifoqueue():
    """Demonstrate LIFO queue: items print in reverse order (3,2,1,0)."""
    start_line('lifo queue')  # original used smart quotes: SyntaxError
    que_obj = LifoQueue()
    for i in range(4):
        que_obj.put(i)
    while not que_obj.empty():
        print(que_obj.get())
class Job(object):
    """A prioritisable work item; lower `level` means higher priority."""

    def __init__(self, level, description):
        # Original had `init` (mangled `__init__`) plus a pointless bare return.
        self.level = level              # numeric priority, smaller = sooner
        self.description = description  # human-readable label

    def __lt__(self, other):
        # PriorityQueue orders entries with `<`, so lowest level pops first.
        return self.level < other.level
def case_priorityqueue():
    """Demonstrate PriorityQueue with Job items: lowest level pops first,
    so the output order is 紧急工作, 中级别工作, 低级别工作."""
    start_line('priority queue')  # original used smart quotes: SyntaxError
    que_obj = PriorityQueue()
    que_obj.put(Job(5, '中级别工作'))
    que_obj.put(Job(10, '低级别工作'))
    que_obj.put(Job(1, '紧急工作'))
    while not que_obj.empty():
        print(que_obj.get().description)
# Original guard was mangled to `if name == ‘main’:` (lost dunders, smart
# quotes) and could never run.
if __name__ == '__main__':
    case_queue()
    case_lifoqueue()
    case_priorityqueue()