# Disclaimer: this file is shared for educational purposes only.
# Scrapes new-home listing info --> MySQL database.
import requests
from lxml import etree
import pymysql
class LianjiaSpider:
    """Scrape new-home ("loupan") listings from lianjia.com and store them in MySQL.

    Workflow: fetch the city index page once to build a name -> URL map, then for a
    requested city fetch listing pages, parse each listing's fields with XPath, and
    insert the rows into the `lj` table.
    """

    # Browser-like UA so the site serves the normal HTML pages.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36'
    }

    def __init__(self, url, database_config):
        """
        :param url: URL of the Lianjia city index page.
        :param database_config: dict of pymysql.connect keyword arguments
                                (host, port, user, password, database).
        """
        self.city_url = url
        self.database_config = database_config
        # Build the city-name -> city-site-URL map once up front.
        self.city_dict = self.get_city_dict()
        # Connection/cursor handles; None until save_data opens them.
        # (Was 0 in an earlier revision — None is the idiomatic "not set" value.)
        self.conn = None
        self.cur = None

    def get_city_dict(self):
        """Fetch the city index page and return {city_name: city_url}."""
        res = requests.get(self.city_url, headers=self.headers)
        tree = etree.HTML(res.text)
        lis = tree.xpath('//ul[@class="city_list_ul"]/li/div[@class="city_list"]//li')
        return {
            li.xpath('./a/text()')[0]: li.xpath('./a/@href')[0]
            for li in lis
        }

    def send_request(self, url):
        """GET *url* with the spoofed headers and return the response body text."""
        res = requests.get(url, headers=self.headers)
        return res.text

    def parse_data(self, html):
        """Parse one listing page.

        :param html: page HTML text.
        :return: list of (title, price, location, room, area) tuples.
        """
        tree = etree.HTML(html)
        divs = tree.xpath('//div[@class="resblock-desc-wrapper"]')
        data = []
        for div in divs:
            # NOTE: the trailing space in class "name " matches the site's markup.
            title = div.xpath('./div/a[@class="name "]/text()')[0]
            price = ''.join(div.xpath('.//div[@class="main-price"]/span/text()'))
            location = ''.join(div.xpath('./div[@class="resblock-location"]//text()'))
            location = location.replace('\n ', '').replace(' ', '')
            room = ''.join(div.xpath('./a[@class="resblock-room"]//text()'))
            room = room.replace('\n ', '').replace(' ', '')
            area = ''.join(div.xpath('./div[@class="resblock-area"]//text()'))
            area = area.replace('\n ', '').replace(' ', '')
            data.append((title, price, location, room, area))
        return data

    def save_data(self, data):
        """Insert parsed rows into the `lj` table (one transaction per call).

        :param data: list of (title, price, location, room, area) tuples.
        """
        # Parameterized placeholders: the driver escapes the scraped values,
        # preventing SQL injection / breakage from quotes in listing text
        # (the previous f-string-built INSERT was injectable).
        insert_sql = 'INSERT INTO lj VALUES (null, %s, %s, %s, %s, %s);'
        # Connect with the config dict unpacked as keyword arguments.
        self.conn = pymysql.connect(**self.database_config)
        try:
            with self.conn.cursor() as cur:
                self.cur = cur
                cur.executemany(insert_sql, data)
            self.conn.commit()
        except Exception as e:
            # Roll the whole page's batch back on any failure, keep going.
            self.conn.rollback()
            print(e)
        finally:
            self.conn.close()
            self.conn = None
            self.cur = None

    def fetch_data(self, city_name):
        """Scrape the first 5 listing pages for *city_name* and persist them."""
        if city_name in self.city_dict:
            city_url = self.city_dict[city_name]
            for page in range(1, 6):
                c_url = city_url + f'loupan/pg{page}'
                html = self.send_request(c_url)   # fetch the page
                data = self.parse_data(html)      # extract listing tuples
                self.save_data(data)              # write them to MySQL
                print(f'当前是第{page}页')
        else:
            print('没有这个城市数据')
if __name__ == "__main__":
    # Where to reach the Lianjia city index, and where to store the results.
    start_url = 'https://www.lianjia.com/city/'
    db_settings = {
        'host': '127.0.0.1',
        'port': 3306,
        'user': 'root',
        'password': 'root',
        'database': 'tmpp',
    }
    spider = LianjiaSpider(start_url, db_settings)
    # Ask the user which city to scrape, then run the pipeline.
    target_city = input('请输入你要查询的城市名:')
    spider.fetch_data(target_city)