No ill intent here; this is purely for learning.
Strategy:
1. Incrementally crawl the second-hand housing transaction data. The site exposes at most 3,000 records, so a direct incremental crawl once a day is enough.
2. For the historical data there are several approaches, and mine is not the optimal one: first crawl every residential community from Anjuke into the database (Anjuke's anti-scraping is fairly strong, so that crawl is incremental as well), then query Lianjia for the transaction listings of each community.
3. Better strategies exist, but since this is just practice I never polished it. Of the 50,000-odd records in total, about 40,000 were captured.
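The scripts below read and write two MySQL tables. Here is a minimal sketch of a schema that would satisfy their INSERT statements; the column types, lengths, and the setup script name are my assumptions, not the original definitions:

# setup_tables.py -- hypothetical helper; column types are inferred from the
# INSERT statements below, not taken from the author's schema
from utils.common import Mc

DDL = [
    """create table if not exists lianjia_ershoufang_xian (
        id int auto_increment primary key,
        house_id bigint unique,      -- Lianjia listing id, used for deduplication
        name varchar(64),            -- community name
        house_type varchar(32),      -- unit type
        house_size float,            -- floor area in square meters
        money_all int,               -- total price
        money_every int,             -- unit price
        success_data datetime,       -- deal date
        img varchar(255),            -- thumbnail url
        link varchar(255)            -- listing url
    )""",
    """create table if not exists xian_home (
        id int auto_increment primary key,
        home varchar(64),            -- community name, used for deduplication
        position varchar(64),        -- district / address
        money_every int              -- average listed price
    )""",
]

if __name__ == '__main__':
    mc = Mc()
    for ddl in DDL:
        mc.update(ddl)  # Mc.update executes and commits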
Code:
Incremental crawler:
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.common import WebRequests, Mc
from scrapy import Selector
import re
import datetime
from urllib import parse
import time

# Incrementally crawl all Lianjia transaction listings
class Lianjia:
    def __init__(self):
        self.web_requests = WebRequests()
        self.mc = Mc()

    def run(self):
        url_one = "https://xa.lianjia.com/chengjiao/pg1/"
        response = self.web_requests.get(url_one)
        selector = Selector(text=response.text)
        url = selector.xpath("//div[@class='page-box house-lst-page-box']/@page-url").extract_first()
        page = selector.xpath("//div[@class='page-box house-lst-page-box']/@page-data").extract_first()
        page_dic = eval(page)  # page-data is a dict literal, e.g. {"totalPage":100,"curPage":1}
        total_page = page_dic.get('totalPage')
        curPage = page_dic.get('curPage')
        while curPage <= total_page:
            time.sleep(1)
            next_url = parse.urljoin(response.url, url.format(page=str(curPage)))
            print('===url:{}'.format(next_url))
            r = self.web_requests.get(next_url)
            selector = Selector(text=r.text)
            ul = selector.xpath("//ul[@class='listContent']/li")
            for li in ul:
                # Community name, unit type, floor area
                title = li.xpath('.//div[@class="info"]/div[@class="title"]/a/text()').extract_first()
                a = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()
                house_id = int(re.match(r'.*?(\d+).*', a).group(1))
                # Orientation
                position = li.xpath('.//div[@class="info"]/div[@class="address"]/div[@class="houseInfo"]/text()').extract_first()
                money_all = int(li.xpath('.//div[@class="totalPrice"]/span/text()').extract_first())  # total price
                money_every = int(li.xpath('.//div[@class="unitPrice"]/span/text()').extract_first())  # unit price
                success_data = li.xpath('.//div[@class="dealDate"]/text()').extract_first()  # deal date
                success_data = datetime.datetime.strptime(success_data, '%Y.%m.%d')
                link = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()  # listing link
                try:
                    name, house_type, size = title.split(' ')
                except Exception as e:
                    print('====error,title:{}'.format(title))
                    continue
                img = li.xpath('.//a/img/@src').extract_first()
                house_size = float(size.replace('平米', ''))
                sql = 'select * from lianjia_ershoufang_xian where house_id={}'.format(house_id)
                r = self.mc.query(sql)
                if not r:  # unseen house_id: this check is what makes the crawl incremental
                    sql = "insert into lianjia_ershoufang_xian (house_id,name,house_type,house_size,money_all,money_every,success_data,img,link) values ({},'{}','{}',{},{},{},'{}','{}','{}')".format(
                        house_id, name, house_type, house_size, money_all, money_every, success_data, img, link)
                    print(sql)
                    self.mc.insert(sql)
            curPage += 1

if __name__ == '__main__':
    Lianjia().run()
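Since the site only exposes roughly 3,000 recent deals, the script above is meant to run once a day. A minimal sketch of a scheduler, assuming the class above lives in a hypothetical module named incremental; an equivalent cron entry works just as well:

# run_daily.py -- a minimal sketch; "incremental" is a hypothetical module name
# for the script above. A cron entry like "0 3 * * * python incremental.py"
# achieves the same without a long-running process.
import time
from incremental import Lianjia

while True:
    Lianjia().run()           # only unseen house_ids get inserted
    time.sleep(24 * 60 * 60)  # wake up once a day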
Crawling all communities from Anjuke:
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import re
import time
from scrapy import Selector
from utils.common import WebRequests, Mc

# Crawl every residential community from Anjuke
class Home:
    def __init__(self):
        self.web_requests = WebRequests()
        self.mc = Mc()

    def run(self):
        url_one = "https://xa.anjuke.com/community/"
        response = self.web_requests.get(url_one)
        selector = Selector(text=response.text)
        urls = selector.xpath("//div[@class='div-border items-list']//div[1]/span[2]/a/@href").extract()
        positions = []
        for url in urls[15:]:  # the leading links are not district links
            position = re.match(r'https://xa.anjuke.com/community/(.*)/', url).group(1)
            positions.append(position)
        print(positions)
        anjuke_url = 'https://xa.anjuke.com/community/'
        for position in positions[1:]:
            url = anjuke_url + position + '/p{}'
            response = self.web_requests.get(url.format(1))
            selector = Selector(text=response.text)
            counts = selector.xpath("//div[@class='sortby']/span/em[2]/text()").extract()
            if counts and int(counts[0]) == 0:
                continue
            try:
                # 30 communities per page; round up so the last partial page is kept
                page_count = int(counts[0]) // 30 + 1
            except Exception as e:
                print(e)
                print(counts)
                continue
            for page in range(1, page_count + 1):
                print('====position:{},page:{}'.format(position, page))
                time.sleep(1)
                response = self.web_requests.get(url.format(page))
                selector = Selector(text=response.text)
                homes = selector.xpath("//div[@class='list-content']/div")
                for item in homes[1:]:  # the first div is not a community entry
                    home = item.xpath('.//div[@class="li-info"]/h3/a/text()').extract_first()
                    home = home.replace(' ', '').replace('\n', '')
                    quyu = item.xpath('.//div[@class="li-info"]/address/text()').extract_first().replace(' ', '').replace('\n', '')
                    price = item.xpath('.//div[@class="li-side"]/p/strong/text()').extract_first().replace('\n', '')
                    sql = "select * from xian_home where home='{}'".format(home)
                    r = self.mc.query(sql)
                    if not r:
                        sql = "insert into xian_home (home,position,money_every) values ('{}','{}',{})".format(home, quyu, price)
                        self.mc.insert(sql)
                    else:
                        sql = "update xian_home set money_every={} where home='{}'".format(price, home)
                        self.mc.update(sql)  # refresh the stored price for known communities

if __name__ == '__main__':
    Home().run()
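The select-then-insert-or-update pattern above costs two round trips per community. Assuming a unique key on the home column (which the original schema may not define), MySQL can collapse it into a single statement; a minimal sketch of a drop-in replacement for the if/else block inside run():

# upsert variant -- assumes xian_home has a unique key on home
sql = ("insert into xian_home (home,position,money_every) values ('{}','{}',{}) "
       "on duplicate key update money_every={}").format(home, quyu, price, price)
self.mc.insert(sql)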
Using the Anjuke community data to crawl all transaction listings for each community on Lianjia:
import sys
import os
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from utils.common import WebRequests, Mc
from scrapy import Selector
import re
import datetime
import time

# For every community in the database, query all its transaction listings on Lianjia
class Lianjia:
    def __init__(self):
        self.web_requests = WebRequests()
        self.mc = Mc()

    def get_home(self):
        # Fetch every community from the database
        sql = 'select home from xian_home'
        homes = self.mc.query(sql)
        return homes

    def run(self):
        homes = self.get_home()
        for idx, home in enumerate(homes):
            url_first = 'http://xa.lianjia.com/chengjiao/pg1rs{}'.format(home[0])
            response = self.web_requests.get(url_first)
            selector = Selector(text=response.text)
            count = selector.xpath('//div[@class="total fl"]/span/text()').extract_first()
            if count:
                count = int(count.replace(' ', ''))
                pages = int(count / 30) + 1
            else:
                continue
            if pages > 50:  # skip suspiciously large result sets
                continue
            for page in range(1, pages + 1):
                time.sleep(1)
                url = 'http://xa.lianjia.com/chengjiao/pg{}rs{}/'.format(page, home[0])
                response = self.web_requests.get(url)
                selector = Selector(text=response.text)
                items = selector.xpath("//ul[@class='listContent']/li")
                for li in items:
                    try:
                        title = li.xpath('.//div[@class="info"]/div[@class="title"]/a/text()').extract_first()
                        if '车位' in title:  # skip parking-space listings
                            continue
                        a = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()
                        house_id = int(re.match(r'.*?(\d+).*', a).group(1))
                        # Orientation
                        position = li.xpath('.//div[@class="info"]/div[@class="address"]/div[@class="houseInfo"]/text()').extract_first()
                        money_all = int(li.xpath('.//div[@class="totalPrice"]/span/text()').extract_first())  # total price
                        money_every = int(li.xpath('.//div[@class="unitPrice"]/span/text()').extract_first())  # unit price
                        success_data = li.xpath('.//div[@class="dealDate"]/text()').extract_first()  # deal date
                        success_data = datetime.datetime.strptime(success_data, '%Y.%m.%d')
                        link = li.xpath('.//div[@class="info"]/div[@class="title"]/a/@href').extract_first()  # listing link
                        try:
                            name, house_type, size = title.split(' ')
                        except Exception as e:
                            print('====error,title:{}'.format(title))
                            continue
                        img = li.xpath('.//a/img/@src').extract_first()
                        try:
                            house_size = float(size.replace('平米', ''))
                        except Exception as e:
                            print('====error,title:{}'.format(title))
                            continue
                        sql = 'select * from lianjia_ershoufang_xian where house_id={}'.format(house_id)
                        r = self.mc.query(sql)
                        if not r:
                            sql = "insert into lianjia_ershoufang_xian (house_id,name,house_type,house_size,money_all,money_every,success_data,img,link) values ({},'{}','{}',{},{},{},'{}','{}','{}')".format(
                                house_id, name, house_type, house_size, money_all, money_every, success_data, img, link)
                            print(sql)
                            self.mc.insert(sql)
                    except Exception as e:
                        continue

if __name__ == '__main__':
    Lianjia().run()
Utility model (database credentials redacted):
import pymysql
import sys

class Mc:
    '''
    Mc: wraps a few common MySQL operations.
    query(sql): SELECT; returns a list of row tuples (empty list if no rows)
    insert(sql)/update(sql): INSERT/UPDATE/DELETE; prints the error on failure
    Usage:
    mc = Mc()
    sql = "SELECT * FROM `biaotiku`"
    data = mc.query(sql)
    for i in data:
        print(i)
    sql = "INSERT INTO `biaotiku` (`id`, `text`, `beizhu`) VALUES (NULL, 'test', '123')"
    mc.insert(sql)
    '''

    def __init__(self, db_host="xxx.xxx.xxx.xxx", username="xxx", pw="xxx", dbname="spider"):
        self.db_host = db_host
        self.username = username
        self.pw = pw
        self.dbname = dbname
        self.db = pymysql.connect(host=self.db_host, user=self.username,
                                  password=self.pw, database=self.dbname)
        self.cursor = self.db.cursor()

    def query(self, sql):
        self.cursor.execute(sql)
        r = self.cursor.fetchall()
        if r:
            return list(r)
        else:
            return []

    def update(self, sql):
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except:
            print(sys.exc_info())

    def insert(self, sql):
        try:
            self.cursor.execute(sql)
            self.db.commit()
        except:
            print(sys.exc_info())

    def __del__(self):
        self.db.close()

if __name__ == '__main__':
    r = Mc().query('select * from proxy_ip where id=3;')
    if r:
        print(r)
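Every script here builds SQL with str.format, which breaks as soon as a community name contains a quote and is unsafe in general. pymysql's cursor.execute accepts a separate args tuple and escapes it, so a parameterized variant of the two methods is straightforward; a minimal sketch:

    # drop-in replacements for Mc.query / Mc.insert using parameterized SQL
    def query(self, sql, args=None):
        self.cursor.execute(sql, args)
        return list(self.cursor.fetchall())

    def insert(self, sql, args=None):
        try:
            self.cursor.execute(sql, args)
            self.db.commit()
        except:
            print(sys.exc_info())

Usage: mc.query('select * from lianjia_ershoufang_xian where house_id=%s', (house_id,))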
Utility common:
import random
import time

import requests

from utils.model import Mc

class WebRequests:
    def __init__(self):
        self.ips = []
        sql = 'select ip from proxy_ip where is_delete=0;'
        rows = Mc().query(sql)
        for ip in rows:
            self.ips.append(ip[0])

    @property
    def user_agent(self):
        """
        Return a User-Agent at random.
        """
        from fake_useragent import UserAgent
        ua = UserAgent()
        return ua.random

    @property
    def header(self):
        """
        Basic headers.
        """
        return {'User-Agent': self.user_agent,
                'Accept': '*/*',
                'Connection': 'keep-alive',
                'Accept-Language': 'zh-CN,zh;q=0.8'}

    @property
    def proxy(self):
        return random.choice(self.ips)

    def get(self, url, header=None, retry_time=1, retry_interval=5, timeout=10, *args, **kwargs):
        """
        GET with retries.
        :param url: target url
        :param header: extra headers merged into the defaults
        :param retry_time: seconds to wait between retries
        :param retry_interval: maximum number of retries
        :param timeout: network timeout
        """
        headers = self.header
        if header and isinstance(header, dict):
            headers.update(header)
        proxies = {"http": "http://" + str(self.proxy)}
        i = 0
        while True:
            try:
                # r = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)
                r = requests.get(url, headers=headers, timeout=timeout)
                return r
            except Exception as e:
                i += 1
                print('====request failed, retrying in {}s (attempt {})'.format(retry_time, i))
                time.sleep(retry_time)
                if i == retry_interval:
                    print('====request failed, please check: {}'.format(url))
                    raise  # give up after the last retry instead of looping forever
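The proxies line inside get() is commented out, so all requests go out directly; for Anjuke's stronger anti-scraping the pool can be re-enabled. A minimal sketch, assuming the proxy_ip table stores rows as "host:port":

# inside WebRequests.get() -- assumes rows in proxy_ip look like "1.2.3.4:8080";
# the same HTTP proxy is reused for https traffic here
proxies = {"http": "http://" + str(self.proxy),
           "https": "http://" + str(self.proxy)}
r = requests.get(url, headers=headers, proxies=proxies, timeout=timeout)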