import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import json
class LianJiaSpider(object):
    """Scrape second-hand housing listings from Lianjia (Shenzhen) into lianjia.json.

    Fetches listing pages ``start``..``end`` (inclusive), parses each listing's
    title, house/position/follow info, prices and image URL, and writes the
    collected records as a JSON array to ``lianjia.json``.
    """

    def __init__(self, url, start, end):
        """
        :param url: base listing URL ending in the page prefix ``pg``,
                    e.g. "https://sz.lianjia.com/ershoufang/pg"
        :param start: first page number to crawl (int or numeric string)
        :param end: last page number to crawl, inclusive (int or numeric string)
        """
        self.url = url
        self.start = start
        self.end = end
        # Browser-like User-Agent so the site does not reject the request.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36"}

    # Build the per-page request URL.
    def handle_url(self, page):
        """Return a ``urllib.request.Request`` for the given page number.

        Page 1 on lianjia.com has no ``pgN`` suffix, so it uses the bare
        listing URL; every other page appends the number to ``self.url``.
        """
        if page == 1:
            url = "https://sz.lianjia.com/ershoufang/"
        else:
            url = self.url + str(page) + "/"
        print(url)
        return urllib.request.Request(url=url, headers=self.headers)

    # Download and parse a single listing page.
    def download(self, request):
        """Fetch one listing page and parse it into a list of item dicts.

        :param request: prepared ``urllib.request.Request``
        :return: list of dicts with Chinese keys (标题/房屋信息/...), one per listing
        """
        # Close the HTTP response deterministically (the original leaked it).
        with urllib.request.urlopen(request) as res:
            soup = BeautifulSoup(res.read(), "lxml")
        items_list = []
        for content in soup.select(".content .sellListContent li.clear"):
            title = self._first_text(content, ".title a")
            if title is None:
                # Skip ad/placeholder <li> rows that have no title link;
                # indexing [0] on them used to raise IndexError.
                continue
            item = {
                "标题": title,
                "房屋信息": self._first_text(content, ".houseInfo"),
                "楼盘信息": self._first_text(content, ".positionInfo"),
                "关注情况": self._first_text(content, ".followInfo"),
                "总价": self._first_text(content, ".priceInfo .totalPrice"),
                "单价": self._first_text(content, ".priceInfo .unitPrice"),
            }
            imgs = content.select(".lj-lazy")
            # Lazily-loaded images keep the real URL in "data-original".
            item["图片"] = imgs[0]["data-original"] if imgs else None
            items_list.append(item)
        return items_list

    @staticmethod
    def _first_text(tag, selector):
        """Return the text of the first element matching *selector*, or None."""
        found = tag.select(selector)
        return found[0].get_text() if found else None

    # Public entry point: crawl all pages and persist the result.
    def startSpider(self):
        """Crawl pages start..end and write all items to ``lianjia.json``."""
        houseInfos = []
        for page in range(int(self.start), int(self.end) + 1):
            houseInfos += self.download(self.handle_url(page))
        # ensure_ascii=False keeps the Chinese keys/values human-readable.
        with open("lianjia.json", "w", encoding="utf-8") as fp:
            json.dump(houseInfos, fp, ensure_ascii=False)
def main():
    """Entry point: read a page range from stdin and launch the spider."""
    # Base listing URL with the "pg" page prefix; page number is appended.
    base_url = "https://sz.lianjia.com/ershoufang/pg"
    first_page = input("请输入开始页: ")
    last_page = input("请输入结束页: ")
    # Build the spider and start crawling the requested range.
    LianJiaSpider(base_url, first_page, last_page).startSpider()
# Run the spider only when executed as a script (indentation restored so
# main() is actually inside the guard).
if __name__ == '__main__':
    main()
import urllib.parse
from bs4 import BeautifulSoup
import json
class LianJiaSpider(object):
    """Spider that collects Lianjia (Shenzhen) second-hand housing listings.

    Crawls result pages ``start``..``end`` and dumps every listing found
    into ``lianjia.json`` as a JSON array of dicts with Chinese keys.
    """

    def __init__(self, url, start, end):
        # Base URL ending in "pg"; the crawl covers pages [start, end].
        self.url = url
        self.start = start
        self.end = end
        # Pretend to be a desktop Chrome browser.
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36"}

    def handle_url(self, page):
        """Return a prepared Request for result page *page*.

        The first page uses the bare listing URL (no "pgN" suffix);
        later pages append the number to ``self.url``.
        """
        target = "https://sz.lianjia.com/ershoufang/" if page == 1 else "{0}{1}/".format(self.url, page)
        print(target)
        return urllib.request.Request(url=target, headers=self.headers)

    def download(self, request):
        """Download one listing page and extract every listing into a dict."""
        response = urllib.request.urlopen(request)
        soup = BeautifulSoup(response.read(), "lxml")

        def pick(node, selector):
            # Text of the first element matching the CSS selector.
            return node.select(selector)[0].get_text()

        results = []
        for node in soup.select(".content .sellListContent li.clear"):
            entry = {
                "标题": pick(node, ".title a"),
                "房屋信息": pick(node, ".houseInfo"),
                "楼盘信息": pick(node, ".positionInfo"),
                "关注情况": pick(node, ".followInfo"),
                "总价": pick(node, ".priceInfo .totalPrice"),
                "单价": pick(node, ".priceInfo .unitPrice"),
                # Lazy-loaded image: real URL lives in the data-original attribute.
                "图片": node.select(".lj-lazy")[0]["data-original"],
            }
            results.append(entry)
        return results

    def startSpider(self):
        """Crawl every page in the configured range and save to lianjia.json."""
        collected = []
        for page in range(int(self.start), int(self.end) + 1):
            collected.extend(self.download(self.handle_url(page)))
        with open("lianjia.json", "w", encoding="utf-8") as fp:
            fp.write(json.dumps(collected))
            fp.flush()
def main():
    """Prompt for the page range, then create and run the spider."""
    url = "https://sz.lianjia.com/ershoufang/pg"
    start = input("请输入开始页: ")
    end = input("请输入结束页: ")
    # Hand the raw inputs to the spider; it converts them to ints itself.
    spider = LianJiaSpider(url, start, end)
    spider.startSpider()
# Script entry guard (indentation restored so main() runs under the guard).
if __name__ == '__main__':
    main()