#!/usr/bin/env python
# -*- coding: utf-8 -*-
from gethtml import getHtml
from urllib import request
from bs4 import BeautifulSoup
import re
# Scrape the paginated room listing on dankegongyu.com (the "sh" section) and
# collect one row per listing card into the module-level list ``lv``.
# Row layout: [mianji, louceng, tingshi, chaoxiang, hezu, yuezu, diqu].

# Listing-index URL up to and including "page="; the 1-based page number is
# appended. The parentheses are required so the adjacent literals are joined
# into a single string.
_LIST_URL_PREFIX = (
    'https://www.dankegongyu.com/room/sh?utm_source=baidu&utm_medium='
    'cpc&utm_campaign=P%E4%B8%8A%E6%B5%B7-%E7%AB%9E%E5%93%81&utm_content'
    '=%E7%AB%9E-%E8%B5%B6%E9%9B%86&utm_term=%E8%B5%B6%E9%9B%86%E7%A7%9F%E6%88%BF%E7%BD%91&'
    'source=baidu&page='
)

# Number of index pages to request (pages 1.._PAGE_COUNT).
_PAGE_COUNT = 108


def _parse_listing(each):
    """Build one result row from a single ``div.r_lbx`` listing card.

    The card's ``div.r_lbx_cenb`` text is split on ``|`` into four parts,
    which are cut down to the bare values. The price comes from
    ``div.r_lbx_money > span.ty_b``. The card's detail page (first link in
    ``div.r_lbx_cena``) is fetched to read the text of the first link in
    ``div.detail-roombox``.

    Args:
        each: BeautifulSoup tag for one listing card.

    Returns:
        ``[mianji, louceng, tingshi, chaoxiang, hezu, yuezu, diqu]`` as
        strings (presumably area, floor, room layout, orientation, share
        type, monthly rent and district -- inferred from the variable names).

    Raises:
        AttributeError: an expected tag is missing from the card/detail page.
        IndexError: the summary text does not have the expected separators.
        KeyError: the detail link has no ``href`` attribute.
    """
    jiegou = each.find('div', class_='r_lbx_cenb').get_text().strip().split('|')
    # Text between '约' and '㎡' in the first part.
    mianji = jiegou[0].split('约')[1].split('㎡')[0]
    # Text before '楼' in the second part.
    louceng = jiegou[1].split('楼')[0]
    tingshi = jiegou[2].strip()
    # The fourth part holds two values separated by a newline.
    tail = jiegou[3].split('\n')
    chaoxiang = tail[0]
    hezu = tail[1].strip()
    money = each.find('div', class_='r_lbx_money')
    yuezu = money.find('span', class_='ty_b').get_text().strip()
    href1 = each.find('div', class_='r_lbx_cena').find('a').attrs['href']
    # Separate name for the detail page so the index-page soup is not shadowed.
    detail_soup = BeautifulSoup(getHtml(href1), 'html.parser')
    diqu = detail_soup.find('div', class_='detail-roombox').find('a').get_text()
    return [mianji, louceng, tingshi, chaoxiang, hezu, yuezu, diqu]


lv = []
for i in range(_PAGE_COUNT):
    print('正在抓取', i + 1, '页')
    html = getHtml(_LIST_URL_PREFIX + str(i + 1))
    soup = BeautifulSoup(html, 'html.parser')
    divs = soup.find('div', class_='roomlist')
    if divs is None:
        # No listing container on this page: skip it rather than crash and
        # lose every row collected so far.
        print('page', i + 1, 'has no roomlist container, skipped')
        continue
    for each in divs.find_all('div', class_='r_lbx'):
        try:
            lv.append(_parse_listing(each))
        except (AttributeError, IndexError, KeyError) as err:
            # One malformed card must not abort the whole crawl.
            print('page', i + 1, 'listing skipped:', repr(err))
import codecs
import csv
def baoLian(lv, path='danke.csv'):
    """Append the rows in ``lv`` to a UTF-8 encoded CSV file.

    The file is opened in append mode, so repeated calls (or repeated runs of
    the script) add rows to the end of an existing file instead of replacing
    it. The file is created if it does not exist.

    Args:
        lv: Iterable of rows; each row is an iterable of cell values.
        path: Destination file. Defaults to ``'danke.csv'`` in the current
            working directory.
    """
    # newline='' lets the csv module control row terminators itself, as its
    # documentation requires for file objects passed to csv.writer.
    with open(path, 'a', encoding='utf-8', newline='') as csvfile:
        csv.writer(csvfile).writerows(lv)
# Write every row collected in lv to the CSV file via baoLian.
baoLian(lv)
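# Trailing text copied along from the blog page (not part of the script):
# "1914" / "(n) comments folded" / "Why were they folded?"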



