今天闲来无事帮同学抓取了一个房地产项目的数据网站
(此处为原文截图:查询页面)
就是这样一个页面
当你输入关键字的时候,在 Network 面板里你会发现这样一个请求
继续往下翻
看看preview
有趣啊
实际操作一波
(实际上我当时写代码的时候发现 XHR里面根本就没有数据传送过来,所以我果断用了selenium,弱智了弱智了)
简单的发送一个请求
可以看到我们所需要的数据都在里面(有时候就是这样,找不到接口的时候就苦逼的很),后面就是提取我们所需要的数据了
我还是放一下之前的代码吧,其实还行
from selenium import webdriver
from selenium.webdriver.support.select import Select
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
import time,requests
from lxml import etree
import csv,re
import os
def get_building_info(name, urls):
    """Download each building ("楼盘") detail table and dump it to one CSV per page.

    Parameters
    ----------
    name : str
        Filename prefix for the output CSVs (surrounding whitespace stripped).
    urls : iterable of str
        Building-detail page URLs; the n-th URL is written to ``<name>楼盘<n>.csv``.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }
    name = name.strip()
    # enumerate replaces the original hand-maintained `num` counter.
    for num, url in enumerate(urls, start=1):
        response = requests.get(url, headers=headers).text
        html = etree.HTML(response)
        # Skip the header row (position 1); each remaining row is one unit.
        trs = html.xpath('//table[@class="dataintable"]//tr[position()>1]')
        # utf-8-sig so the Chinese text opens correctly in Excel on Windows
        # (previously the platform default codec was used, which can fail).
        with open(name + '楼盘' + str(num) + '.csv', 'w', newline='', encoding='utf-8-sig') as file:
            writer = csv.writer(file)
            for tr in trs:
                tds = tr.xpath('./td[position()<7]')
                # Empty cells have no text node; fall back to '' instead of
                # raising IndexError as the original [0] index did.
                writer.writerow([(td.xpath('./text()') or [''])[0] for td in tds])
# get_building_info('tgj123','http://tp.tangshan.gov.cn:8090/wsysbudinghouse.jspx?item_code=00001492&build_code=0207')
def get_item_info(item, urls1, urls2):
    """Scrape pre-sale-permit detail pages and merge them with building summaries.

    Parameters
    ----------
    item : str
        Query label, forwarded to :func:`get_building_info` as the CSV prefix.
    urls1 : iterable of str
        Permit detail page URLs (key/value tables).
    urls2 : iterable of str
        Building list page URLs; the last row of each table is sampled.

    Returns
    -------
    list of list of str
        One row per permit: detail values followed by the building summary cells.
        Truncated to the shorter of the two URL lists (zip semantics, matching
        the original two-iterable ``map``).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'
    }
    # Pass 1: permit detail pages — the value cells are the even-numbered <td>s.
    # (The original predicate `position()=((position() mod 2)=0)` relied on
    # XPath 1.0 boolean coercion and selected exactly the even positions.)
    informations = []
    for url in urls1:
        response = requests.get(url, headers=headers).text
        html = etree.HTML(response)
        info = []
        for tr in html.xpath('//table[@class="dataintable"]//tr'):
            for td in tr.xpath('./td[position() mod 2 = 0]'):
                text = td.xpath('./text()')
                # Guard empty cells instead of IndexError-ing.
                info.append(text[0] if text else '')
        informations.append(info)
    # Pass 2: building list pages — take the last row's first six cells and
    # remember the link to the per-building detail page.
    ppx = []
    loupan_urls = []
    for url in urls2:
        response = requests.get(url, headers=headers).text
        html = etree.HTML(response)
        tds = html.xpath('//table[@class="dataintable"]//tr[last()]/td[position()<7]')
        href = html.xpath('//table[@class="dataintable"]//tr[last()]/td[last()]/a/@href')[0]
        loupan_urls.append('http://tp.tangshan.gov.cn:8090' + href)
        ppx.append([(td.xpath('./text()') or [''])[0] for td in tds])
    # BUGFIX: download the per-building tables once, after ALL links are known.
    # The original called this inside the loop above, re-fetching every earlier
    # URL and rewriting its CSV on each iteration.
    get_building_info(item, loupan_urls)
    # Stitch the two passes together row by row.
    return [detail + summary for detail, summary in zip(informations, ppx)]
# ---------------------------------------------------------------------------
# Script entry: drive a headless Chrome through the pre-sale query form,
# collect the detail/building link lists from the result table, then scrape
# and export everything to <query>.csv.
# ---------------------------------------------------------------------------

# Output column headers (shared by both query modes; the original duplicated
# this list in each branch).
CSV_HEADERS = ['预售证号', '售房单位', '项目名称', '预售总面积', '房屋坐落位置', ' 预售套数', '发证日期', '预售范围', '发证机关', '预售对象', '栋号', '预售许可证',
               '总套数', '总面积', '总层数', '状态', '楼盘']

BASE_URL = 'http://tp.tangshan.gov.cn:8090'


def _collect_detail_urls(page_source):
    """Extract absolute permit-detail and building-list URLs from a result page."""
    html = etree.HTML(page_source)
    urls1 = [BASE_URL + u for u in html.xpath('//table[@class="dataintable"]//tr[position()>1]/td[5]/a/@href')]
    urls2 = [BASE_URL + u for u in html.xpath('//table//tr[position()>1]/td[last()]/a/@href')]
    return urls1, urls2


def _write_csv(filename, rows):
    """Write header + data rows; utf-8-sig so Excel renders the Chinese text."""
    with open(filename, 'w', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file)
        writer.writerow(CSV_HEADERS)
        writer.writerows(rows)


chrome_options = Options()
chrome_options.add_argument('--headless')
driver = webdriver.Chrome(chrome_options=chrome_options,
                          executable_path='C:\\Program Files (x86)\\Google\\Chrome\\Application\\CHROME\\chromedriver.exe')
try:
    driver.get('http://tp.tangshan.gov.cn:8090/wsyscx.jspx')
    time.sleep(1)
    se = driver.find_element_by_id('type')
    key = input('请输入预售查询的类型,为 预售证 还是 项目名称(如果你输入的不是预售证,则默认为后者): ')
    if key == '预售证':
        Select(se).select_by_index(1)
        query = input('请输入预售证:')
    else:
        # NOTE(review): the original also used index 1 here — confirm whether
        # the "项目名称" option should be a different dropdown index.
        Select(se).select_by_index(1)
        query = input('请输入项目名称:')
    driver.find_element_by_name('typeval').send_keys(query)
    time.sleep(1)
    driver.find_element_by_xpath('//*[@id="container"]/div/div/form/table/tbody/tr[2]/td[3]/input').click()
    # Give the results table a moment to render before reading page_source
    # (the original read it immediately after the click).
    time.sleep(1)
    urls1, urls2 = _collect_detail_urls(driver.page_source)
finally:
    # BUGFIX: the original never quit the driver, leaking the browser process.
    driver.quit()

# BUGFIX: the original tested `urls1 == None`, which is never true for an
# xpath() result (always a list), so the empty-result guard was dead code.
if not urls1:
    raise SystemExit('未找到任何查询结果')

# BUGFIX: in the 预售证 branch the original called get_item_info(urls1, urls2)
# with the first argument missing (TypeError) and then referenced an undefined
# `item`; both branches are now one deduplicated path keyed on `query`.
query = query.strip()
informations = get_item_info(query, urls1, urls2)
_write_csv(query + '.csv', informations)