# 爬虫下载深圳新房数据 — crawler that downloads Shenzhen new-home sales data

import re
import sys
import requests
from bs4 import BeautifulSoup

# Python 2 relic: the original did `reload(sys)` + `sys.setdefaultencoding('utf-8')`
# (the paste garbled `reload(sys)` into `(sys)`).  Neither call exists on
# Python 3, where str is already Unicode, so only run them when available.
if hasattr(sys, "setdefaultencoding"):  # Python 2 only
    sys.setdefaultencoding("utf-8")

def update_data_dict(soup, d=None):
    """Refresh the ASP.NET postback fields from a freshly parsed page.

    Copies the hidden ``__VIEWSTATE`` / ``__EVENTVALIDATION`` tokens out of
    *soup* into the form dict and advances ``__EVENTARGUMENT`` (the page
    number) by one, so the next POST requests the following page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of the page just fetched.
    d : dict, optional
        Form dict to update; defaults to the module-level ``data_dict``.
    """
    if d is None:
        d = data_dict
    d['__VIEWSTATE'] = soup.find(id='__VIEWSTATE')['value']
    d['__EVENTVALIDATION'] = soup.find(id='__EVENTVALIDATION')['value']
    # __EVENTARGUMENT holds the page index for the AspNetPager control
    d['__EVENTARGUMENT'] += 1

def save_data(soup, path='result1.txt'):
    """Append one line per ``<tr>`` row of *soup* to a text file.

    Each row becomes the space-joined text of its ``<td>`` cells.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed detail page containing the result table rows.
    path : str, optional
        Output file to append to (default ``'result1.txt'``).
    """
    # utf-8 explicitly: the rows contain Chinese text and the default
    # locale encoding may not handle it.
    with open(path, 'a', encoding='utf-8') as f:
        for tr in soup('tr', ''):
            f.write(' '.join(td.text for td in tr('td')) + '\n')

def page_data_url(soup):
    """Extract every detail-page link from a listing page and crawl each.

    Anchors look like ``<a href="./certdetail.aspx?id=43933" target=...>``;
    each relative URL is printed and handed to :func:`page_data`.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one listing page.
    """
    anchors = str(soup('a'))
    # NOTE(review): the original pattern had an unescaped `.` before
    # "target" (matching the space); made the separator explicit here.
    for rel_url in re.findall(r'<a href="\./(.*?)" target', anchors):
        print(rel_url)
        page_data(rel_url)

def page_data(url):
    """Fetch one detail page and append its table rows to the result file.

    Parameters
    ----------
    url : str
        Detail path relative to the site root
        (e.g. ``'certdetail.aspx?id=43933'``).
    """
    full_url = 'http://zjj.sz.gov.cn/ris/bol/szfdc/' + url
    # timeout so one stuck request cannot hang the whole crawl
    html = requests.get(full_url, timeout=30).text
    soup = BeautifulSoup(html, 'lxml')
    save_data(soup)

def get_next_page_data():
    """POST the current form state to fetch and crawl the next listing page.

    Uses the module-level ``url`` and ``data_dict`` (ASP.NET postback
    fields).  Crawls the detail links on the returned page, then refreshes
    the postback tokens so the following call advances one more page.
    """
    # timeout so one stuck request cannot hang the whole crawl
    html2 = requests.post(url, data=data_dict, timeout=30).text
    soup2 = BeautifulSoup(html2, 'lxml')
    page_data_url(soup2)
    # must run after page_data_url: bumps __EVENTARGUMENT to the next page
    update_data_dict(soup2)

# --- crawl driver -----------------------------------------------------------
url = 'http://zjj.sz.gov.cn/ris/bol/szfdc/'
html1 = requests.get(url, timeout=30).text
soup1 = BeautifulSoup(html1, 'html.parser')

# The pager box ("共NNN条" = "NNN records in total") lives in this div.
pager_div = soup1.find('div', 'titebox right')
pager_text = pager_div.get_text() if pager_div is not None else ''
print(pager_text)

# BUG FIX: the original passed the bs4 Tag object itself to re.search,
# which raises TypeError — search the div's text instead.
match = re.search(r'共(\d+)条', pager_text)
print(match)

# ASP.NET postback form for the AspNetPager control; the empty token
# fields are filled in by update_data_dict() after each fetch.
data_dict = dict(
    scriptManager2='updatepanel2|AspNetPager1',
    __EVENTTARGET='AspNetPager1',
    __EVENTARGUMENT=1,
    __LASTFOCUS='',
    __VIEWSTATE='',
    __VIEWSTATEGENERATOR='2A35A6B2',
    __VIEWSTATEENCRYPTED='',
    __EVENTVALIDATION='',
    tep_name='',
    organ_name='',
    site_address='',
    ddlPageCount=10,
)
update_data_dict(soup1)

print(soup1)

page_data_url(soup1)
# Page count: derive it from the record total when the pager text parsed
# (10 rows per page, matching ddlPageCount); otherwise fall back to the
# original hard-coded 342 pages.
total_pages = (int(match.group(1)) + 9) // 10 if match else 342
for _ in range(total_pages):
    get_next_page_data()

# --- NOTE(review): everything below is CSDN blog-page boilerplate that was
# --- copy-pasted along with the code (paywall/footer text, not program code).
# 展开阅读全文
# ©️2020 CSDN 皮肤主题: 1024 设计师: 上身试试 返回首页
# 实付0元
# 点击重新获取
# 扫码支付
# 钱包余额 0
#
# 抵扣说明:
#
# 1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
# 2.余额无法直接购买下载,可以购买VIP、C币套餐、付费专栏及课程。
#
# 余额充值