import datetime
import re
from urllib.parse import urljoin

import pandas
import requests
from pyquery import PyQuery as pq
def save_as_csv(houses_info_list):
    """Dump the scraped listings to a CSV file named after today's date.

    Args:
        houses_info_list: Records to tabulate; passed straight to
            ``pandas.DataFrame``.

    The file is written relative to the current working directory and is
    named ``<YYYY-MM-DD>上海二手房信息.csv``.
    """
    date_stamp = datetime.date.today().strftime('%Y-%m-%d')
    csv_name = '{}上海二手房信息.csv'.format(date_stamp)
    table = pandas.DataFrame(houses_info_list)
    table.to_csv(csv_name)
def get_house_info_list(url):
    """Scrape every listing linked from an index page.

    Args:
        url: Address of the listing index page. Links found on the page are
            resolved against this address.

    Returns:
        A list with one entry per listing, each being the dict returned by
        ``get_house`` for that listing's detail page.
    """
    html = requests.get(url).text
    house_list = []
    for house in pq(html).find('.houseList > .list > .info').items():
        href = house.find('.title > a').attr('href')
        if not href:
            # Entry without a title link: there is no detail page to fetch,
            # and formatting None into the URL would request "<url>None".
            continue
        # urljoin handles root-relative, relative and absolute hrefs, and
        # avoids the "//" produced by gluing "/path" onto a base ending in "/".
        house_list.append(get_house(urljoin(url, href)))
    return house_list
def get_house(url):
    """Scrape a single listing detail page into a flat dict.

    Args:
        url: Address of the listing detail page.

    Returns:
        A dict of strings. It always contains the keys '标题', '价格',
        '联系人' and '联系方式'; the remaining keys are whatever labels are
        found in the page's ``.trl-item1``, ``.trl-item2`` and
        ``.qu_bianqu1 > .text-item`` elements.
    """
    info = {}
    doc = pq(requests.get(url).text)

    info['标题'] = doc.find('#lpname').text()
    info['价格'] = doc.find('div.trl-item.sty1').text()

    # Each .trl-item1 is read as "<value> <label>". Split only on the last
    # run of whitespace so a value that itself contains spaces is kept whole,
    # and skip items that do not have both parts instead of raising.
    for item in doc.find('.trl-item1').items():
        parts = item.text().strip().rsplit(None, 1)
        if len(parts) != 2:
            continue
        value, key = parts
        info[key] = value

    # .trl-item2 carries an explicit label/content pair; all whitespace is
    # removed from both, and the '地图' text is dropped from the content.
    for item in doc.find('.trl-item2').items():
        key = ''.join(item.find('.lab').text().split())
        value = ''.join(item.find('.rcont').text().split())
        info[key] = value.replace('地图', '')

    info['联系人'] = doc.find('#agentname').text()
    info['联系方式'] = doc.find('#mobilecode').text()

    date_pattern = re.compile(r'(\d{4}-\d{2}-\d{2})')
    for item in doc.find('.qu_bianqu1 > .text-item').items():
        key = item.find('.lab').text()
        value = item.find('.rcont').text()
        if key == '挂牌时间':
            # Keep only the YYYY-MM-DD part; if no date is present, fall back
            # to the raw text rather than failing on a missing match.
            found = date_pattern.search(value)
            if found is not None:
                value = found.group(1)
        info[key] = value

    print(info)
    return info
if __name__ == '__main__':
    # Script entry point: scrape the listings index, then write the result
    # to a dated CSV file.
    save_as_csv(get_house_info_list('http://esf.sh.fang.com/'))
# python - pyquery 房天下 spider
# 最新推荐文章于 2023-09-02 23:28:27 发布