# -*- coding: utf-8 -*-
"""
Created on Fri Jun 14 17:37:44 2019
@author: User
"""
import os
import re
from bs4 import BeautifulSoup  # the beautifulsoup4 package is imported under the name bs4
import requests
import numpy as np
# a desktop-browser User-Agent so the site serves the normal HTML page
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36'
}
i_pg = 0  # global page counter; also drives the next-page offset
# Structured dtype for one transaction record (field names are pinyin):
# bianhao = parcel number, quyu = district, riqi = date,
# nian/yue/ri = year/month/day, zhuangtai = status,
# mianji = area, jiage = price, yongtu = land use
area_dtype = np.dtype([('bianhao', np.str_, 50),
                       ('quyu', np.str_, 30),
                       ('riqi', np.str_, 30),
                       ('nian', np.str_, 30),
                       ('yue', np.str_, 30),
                       ('ri', np.str_, 30),
                       ('zhuangtai', np.str_, 30),
                       ('mianji', np.str_, 30),
                       ('jiage', np.str_, 30),
                       ('yongtu', np.str_, 30)])
# accumulator for all scraped rows; the first record acts as a CSV header row
array_area_all = np.array([('title', 'quyu',
                            'riqi', 'nian',
                            'yue', 'ri',
                            'zhuangtai', 'mianji',
                            'jiage', 'yongtu')], dtype=area_dtype)
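# A minimal sketch of how records of this dtype behave (illustrative values only):
#   rec = np.array([('001', 'XiHu', '2019-06-14', '2019', '06', '14',
#                    'sold', '1000', '500', 'residential')], dtype=area_dtype)
#   rec['quyu']      -> array(['XiHu'], dtype='<U30')
#   rec[0]['jiage']  -> '500'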
def get_page(url):
    global i_pg
    i_pg += 1
    print('page:', i_pg)
    # print(url)
    # Hard stop after 50 pages so the recursion below terminates; returning
    # (rather than exiting) lets the top-level code still print and save
    # whatever has been collected.
    if i_pg > 50:
        return ''
    try:
        response = requests.get(url, timeout=30, headers=header)
        # raise HTTPError if the status code is not 200
        response.raise_for_status()
        # let requests detect the page encoding from the content
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        result_li = soup.find_all(class_=re.compile("rich-table-row.*?"))
        # print('result_li:', result_li)
        j = 0
        # process each listing row on the current page
        for row_text in result_li:
            # j counts rows; an earlier debugging version used it to handle
            # only the first few links per page (the limit is no longer applied)
            j = j + 1
            # parcel number
            title_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id92")})
            global title
            title = title_str.text.lstrip()
            # print('parcel number:', title)
            # district
            quyu_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id98")})
            global quyu
            quyu = quyu_str.text.lstrip()
            # print('district:', quyu)
            # transaction price; strip the '万元' (10,000-yuan) unit
            chengjiaojia_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id107")})
            global chengjiaojia
            chengjiaojia = chengjiaojia_str.text.lstrip().replace('万元', '', 1)
            chengjiaojia = my_strip(chengjiaojia)
            if len(chengjiaojia) == 0:
                chengjiaojia = '0'
            # print('price:', chengjiaojia)
            # transaction date
            chengjiao_date_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id116")})
            global chengjiao_date
            chengjiao_date = chengjiao_date_str.text.lstrip()
            # split an ISO-style date such as '2019-06-14' into year/month/day
            global nian
            nian = chengjiao_date[0:4]
            # print('year:', nian)
            global yue
            yue = chengjiao_date[5:7]
            # print('month:', yue)
            global ri
            ri = chengjiao_date[8:10]
            # print('day:', ri)
            # print('transaction date:', chengjiao_date)
            # transaction status
            chengjiao_state_str = row_text.find('td', {'id': re.compile(r"j_id46:\d+:j_id119")})
            global chengjiao_state
            chengjiao_state = chengjiao_state_str.text.lstrip()
            # print('status:', chengjiao_state)
            detail_href = row_text.find('a', {'id': re.compile(r"j_id46:\d+:j_id124")})
            detail_url = 'http://****' + detail_href.attrs['href']
            get_page_detail(detail_url)
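            # get_page_detail fills the globals `area` and `use_1` used below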
            # build a one-record array for this row
            global area_new
            area_new = np.array([(title, quyu,
                                  chengjiao_date, nian,
                                  yue, ri,
                                  chengjiao_state,
                                  area, chengjiaojia,
                                  use_1)], dtype=area_dtype)
            # print('area_new:', area_new)
            # append to the accumulator; np.concatenate keeps the result a 1-D
            # structured array (np.vstack would produce a 2-D (N, 1) array that
            # np.savetxt cannot format field by field)
            global array_area_all
            array_area_all = np.concatenate((array_area_all, area_new))
        # crawl the next page; firstResult is a zero-based row offset at
        # 20 rows per page, so the page after page i_pg starts at i_pg * 20
        result_next_page = 'http://****?firstResult=' + str(i_pg * 20) + '&priceUnit=TotalPrice&logic=and'
        # recurse; the constructed URL string is never empty, so the crawl
        # only stops at the i_pg > 50 guard above
        get_page(result_next_page)
        return response.text
    except Exception as e:
        # Exception (not a bare except) so the error message is preserved
        return 'exception occurred: %s' % e
# remove spaces, newlines and tabs anywhere in a string, then strip the ends
def my_strip(s):
    return str(s).replace(" ", "").replace("\n", "").replace("\t", "").strip()
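# For example:
#   my_strip('  1 234\n平方米\t') -> '1234平方米'
#   my_strip(None) -> 'None' (the str() call makes non-string input safe)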
# BeautifulSoup is constructed repeatedly, so wrap it in a small helper
def my_Beautifulsoup(response):
    return BeautifulSoup(str(response), 'html.parser')
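# Usage sketch (the helper is defined but not currently called in this script):
#   cell_soup = my_Beautifulsoup(row_text)  # re-parse a tag's HTML
#   cell_soup.find_all('span')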
# scrape one listing's detail page for the parcel area and land use
def get_page_detail(url):
    response = requests.get(url, headers=header)
    if response.status_code == 200:
        # detect the encoding as in get_page, so the unit text parses correctly
        response.encoding = response.apparent_encoding
        soup = BeautifulSoup(response.text, 'html.parser')
        # parcel area: the second span.layout holds the value;
        # strip the '平方米' (square-metre) unit
        area_all = soup.find(attrs={'id': 'j_id242'})
        area_2 = area_all.find_all('span', {'class': 'layout'})[1]
        global area
        area = area_2.text.lstrip().replace('平方米', '', 1)
        area = my_strip(area)
        if len(area) == 0:
            area = '0'
        # print('area: ' + area)
        # land use
        use_all = soup.find(attrs={'id': 'j_id267'})
        use_2 = use_all.find_all('span', {'class': 'layout'})[1]
        global use_1
        use_1 = use_2.text.lstrip()
        use_1 = my_strip(use_1)
        # print('use: ' + use_1)
# =============================================================================
# Entry point: crawl from the first results page, then dump everything to CSV.
get_page('http://****')
print(array_area_all)
# make sure the output directory exists before writing
os.makedirs('data', exist_ok=True)
np.savetxt('data\\zl.csv', array_area_all, delimiter=',', fmt="%s")