话不多说直接上代码
这是将数据导入到MySQL数据库中的代码
import mysql.connector
class HandleMySQL(object):
    """Thin wrapper around a MySQL connection for storing scraped listings.

    Prerequisites (create these before running):
      * a database named ``crawl_beike`` (default; now parameterizable),
      * a table ``house_info`` with five columns matching the insert order:
        title, price, area, layout, district.
    """

    def __init__(self, host='localhost', port=3306, user='root',
                 password='123456', database='crawl_beike'):
        # Connection settings are keyword parameters (defaults preserve the
        # original hard-coded values) so the class works against any server
        # without editing the source. Adjust user/password to your machine.
        self.connector = mysql.connector.connect(
            host=host, port=port, user=user,
            password=password, database=database)
        self.cursor = self.connector.cursor()

    def insert_one(self, info):
        """Insert one listing and commit immediately.

        ``info`` must contain the keys '标题', '价格', '面积', '布局', '地区'
        (title, price, area, layout, district), in the table's column order.
        """
        sql = "INSERT INTO house_info VALUES (%s,%s,%s,%s,%s)"
        data = (info['标题'], info['价格'], info['面积'], info['布局'], info['地区'])
        # Parameterized execute: the connector escapes values, so scraped
        # text cannot inject SQL.
        self.cursor.execute(sql, data)
        self.connector.commit()

    def close_connector(self):
        """Release the cursor, then the underlying connection."""
        self.cursor.close()
        self.connector.close()
# 实例化
mysql = HandleMySQL()
以下是爬虫代码
import requests
import re
from lxml import etree
import multiprocessing
from concurrent.futures import ThreadPoolExecutor
# 导入上面写的存储到MySQL的模块(文件目录命名为crawl_beike,写的存储的py文件命名为handle_MySQL)
from crawl_beike.handle_MySQL import mysql
# 定义一个类
class CrawlBeike(object):
    """Crawler for Shenzhen rental listings on https://sz.zu.ke.com.

    Pipeline: city page -> district URLs -> sub-area URLs -> per-area page
    URLs (discovered with a thread pool, I/O bound) -> per-page scraping and
    MySQL inserts (a process pool of workers).
    """

    def __init__(self):
        # First request target: the city-wide listing page, used to discover
        # the district URLs (Luohu, Futian, Longgang, Nanshan, ...).
        self.first_url = 'https://sz.zu.ke.com/zufang/'
        # Sub-area URLs gathered from each district page (e.g. Nanshan ->
        # Keji Yuan, Gaoxin Qu, Xili, ...); each ends in 'pg' so that a page
        # number can simply be appended.
        self.area_url = []
        # Desktop User-Agent so the site serves the full page layout.
        self.header = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3902.4 Safari/537.36'
        }
        # Fully built page URLs ('<area>pg<N>') that actually contain results.
        self.page_url = []

    def request(self, url):
        """GET *url* with the crawler's headers and return the body text."""
        response = requests.get(url=url, headers=self.header)
        return response.text

    def get_area_url(self):
        """Populate ``self.area_url`` with every sub-area listing URL."""
        response = self.request(self.first_url)
        html = etree.HTML(response)
        # District (level-2) links off the city page.
        district_url = []
        all_district = html.xpath('//ul[@data-target="area"]/li[@class="filter__item--level2 "]/a/@href')
        for district in all_district:
            district_url.append('https://sz.zu.ke.com' + district)
        # Level-3 (sub-area) links inside each district page.
        for url in district_url:
            response = self.request(url)
            html = etree.HTML(response)
            all_area = html.xpath('//li[@class="filter__item--level3 "]/a/@href')
            for area in all_area:
                # Trailing 'pg' lets get_page_url append page numbers directly.
                self.area_url.append('https://sz.zu.ke.com' + area + 'pg')

    def get_page_url(self, url):
        """Append every non-empty page URL under *url* to ``self.page_url``.

        Walks pg1, pg2, ... until the page's result counter reads '0'.
        """
        page = 1
        while True:
            p_url = url + str(page)
            response = self.request(p_url)
            html = etree.HTML(response)
            # BUGFIX: guard the xpath result — on a page without the counter
            # element, the original `[0]` raised IndexError and silently
            # killed the thread-pool worker.
            nums = html.xpath('//span[@class="content__title--hl"]/text()')
            if not nums or nums[0] == '0':
                break
            print(p_url)
            self.page_url.append(p_url)
            page += 1

    def get_info(self, url):
        """Scrape one listing page and store each house via ``mysql``."""
        response = self.request(url)
        html = etree.HTML(response)
        house_info = []
        all_div = html.xpath('//div[@class="content__list--item"]')
        # Captures "<area>㎡ ... N室N厅N卫" from the description text blob.
        pattern = r'([0-9-]*?㎡)[\d\D]*?(\d室\d厅\d卫)'
        for item in all_div:
            try:
                info = {}
                info['标题'] = str(item.xpath('.//p[@class="content__list--item--title twoline"]/a/text()')[0].strip())
                info['价格'] = str(item.xpath('.//em/text()')[0])
                detail = str(''.join(item.xpath('.//p[@class="content__list--item--des"]/text()')))
                # Run the regex once and reuse the match (original searched twice).
                match = re.search(pattern, detail)
                info['面积'] = str(match.group(1))
                info['布局'] = str(match.group(2))
                info['地区'] = str(html.xpath('//li[@class="filter__item--level3 strong"]/a/text()')[0])
                house_info.append(info)
                mysql.insert_one(info)
            except Exception as e:
                # Best-effort per listing: log and skip malformed items.
                print(e)
        print(house_info)

    def run(self):
        """Run the whole pipeline: discover pages, then scrape them."""
        self.get_area_url()
        # Threads for page discovery (network-bound, GIL released on I/O).
        executor = ThreadPoolExecutor()
        executor.map(self.get_page_url, self.area_url)
        executor.shutdown()
        # Processes for page scraping and DB inserts.
        pool = multiprocessing.Pool(20)
        for url in self.page_url:
            pool.apply_async(self.get_info, args=(url,))
        pool.close()
        pool.join()
        # BUGFIX: close the MySQL connection only AFTER all workers finish;
        # the original closed it before pool.close()/pool.join(), tearing the
        # connection down while inserts could still be in flight.
        mysql.close_connector()
if __name__ == '__main__':
    # Script entry point: build the crawler and launch the full pipeline.
    CrawlBeike().run()
这是爬取的数据
导入到数据库的效果图