import threading
import requests
from time import sleep
from bs4 import BeautifulSoup
import csv
from queue import Queue, Empty
# A downloader thread: pops page numbers, fetches each page, and pushes the HTML onto a data queue
class CrawlThread(threading.Thread):
    def __init__(self, page_queue, data_queue, crawl_name):
        super().__init__()
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.80 Safari/537.36'}
        self.page_queue = page_queue
        self.crawl_name = crawl_name
        self.data_queue = data_queue
    def run(self):
        # The request loop:
        # 1. Pop a page number off the page queue   2. Build the URL
        # 3. Send the request                       4. Push the response body onto the data queue
        while True:
            # get_nowait instead of get: a plain get() would block forever if
            # another downloader grabbed the last page number first
            try:
                page = self.page_queue.get_nowait()
            except Empty:
                # Every page number has been taken, so this thread is done
                break
            # Build the URL
            url = "http://bj.maitian.cn/zfall/PG" + str(page)
            print("Thread %s requesting page %s" % (self.crawl_name, url))
            # Send the request
            res = requests.get(url=url, headers=self.headers)
            # Push the HTML onto the data queue
            self.data_queue.put(res.text)
            print("data queue size: %d" % self.data_queue.qsize())
            sleep(5)
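
# A hedged helper sketch (not wired into CrawlThread above): requests.get with
# an explicit timeout plus a single retry, since a hung connection would
# otherwise stall a downloader thread indefinitely. The retry count, timeout,
# and back-off values here are illustrative assumptions, not part of the
# original script.
def fetch_with_timeout(url, headers, retries=1, timeout=10):
    for attempt in range(retries + 1):
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException:
            if attempt == retries:
                raise
            sleep(2)  # brief back-off before retrying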
# Pages still waiting to be parsed; must match the number of page numbers
# queued in main() (range(1, 10) -> 9 pages)
count = 9

# A parser thread: pops HTML off the data queue, parses it, and writes rows to a shared CSV file
class ParseThread(threading.Thread):
    def __init__(self, data_queue, parse_name, lock, fp):
        super().__init__()
        print("Starting thread:", parse_name)
        self.lock = lock
        self.data_queue = data_queue
        self.parse_name = parse_name
        self.fp = fp
    def run(self):
        # 1. Pop one page's HTML off the data queue   2. Parse and store it
        global count
        while True:
            if count == 0:
                # All pages parsed; the file is closed in main() after join()
                break
            # get_nowait so a thread never blocks forever when a peer grabs
            # the last item between an empty() check and a get()
            try:
                html = self.data_queue.get_nowait()
            except Empty:
                sleep(0.1)  # brief pause rather than a hot busy-wait
                continue
            # Parse and store the page
            self.parse_content(html)
            # count is shared by every parser thread, so guard the decrement
            with self.lock:
                count -= 1
            sleep(1)
    # Parse one page of HTML and store every listing it contains
    def parse_content(self, html):
        soup = BeautifulSoup(html, "lxml")
        # Each listing sits in an <li class="clearfix">
        house_list = soup.select("li.clearfix")
        for house in house_list:
            item = {}
            item["title"] = house.select("h1")[0].get_text()
            item["price"] = house.select(".the_price span")[0].get_text()
            item["address"] = house.select(".house_hot span")[0].get_text()[1]
            item["area"] = house.select(".list_title > p span")[0].get_text()[0]
            print("Thread %s storing a listing" % self.parse_name)
            self.write_to_csv(item)
    # Serialize one listing to the shared CSV file under the lock
    def write_to_csv(self, item):
        with self.lock:
            writer = csv.writer(self.fp)
            writer.writerow(list(item.values()))
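
# An alternative shutdown sketch (not used above): instead of the shared
# `count` global, each downloader could enqueue one SENTINEL after the page
# queue is drained, and each parser would exit on the first sentinel it pops;
# with three downloaders and three parsers the counts line up. The names below
# are illustrative assumptions, not part of the original script.
SENTINEL = None

def parse_until_sentinel(data_queue, handle_html):
    while True:
        html = data_queue.get()  # blocking get, so no busy-wait is needed
        if html is SENTINEL:
            break
        handle_html(html)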
def main():
    # A queue holding every page number to crawl
    page_queue = Queue()
    for i in range(1, 10):
        page_queue.put(i)
    # A queue holding each downloaded page's HTML
    data_queue = Queue()
    # Three downloader threads, kept in a list so they can be joined later
    crawl_thread_list = []
    crawl_name_list = ["downloader-1", "downloader-2", "downloader-3"]
    for crawl_name in crawl_name_list:
        t = CrawlThread(crawl_name=crawl_name, data_queue=data_queue, page_queue=page_queue)
        t.start()
        crawl_thread_list.append(t)
    # Open the output file; newline="" keeps the csv module from writing blank lines on Windows
    fp = open("maitian.csv", "a+", newline="", encoding="utf-8")
    # Write the header row
    w = csv.writer(fp)
    w.writerow(["title", "price", "address", "area"])
    # A lock shared by the parser threads for the file and the page counter
    lock = threading.Lock()
    # Three parser threads that write the rows
    parse_thread_list = []
    parse_name_list = ["parser-1", "parser-2", "parser-3"]
    for parse_name in parse_name_list:
        t = ParseThread(lock=lock, data_queue=data_queue, parse_name=parse_name, fp=fp)
        t.start()
        parse_thread_list.append(t)
    # Wait for every thread to finish, then close the output file
    for crawl in crawl_thread_list:
        crawl.join()
    for parse in parse_thread_list:
        parse.join()
    fp.close()
if __name__ == '__main__':
main()
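
# Usage note: running this script crawls pages 1-9 of bj.maitian.cn/zfall and
# appends rows to maitian.csv. Because the file is opened in "a+" mode, a new
# header row is appended on every run; delete or truncate maitian.csv first if
# a single header is wanted.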