#-*-coding:utf-8-*-
import json
import re
import time
import urllib
import urllib2

import lxml
import requests
from bs4 import BeautifulSoup
from lxml import etree
from pymongo import MongoClient
# MongoDB handle: scraped listings are written to the ``House`` collection of
# the ``test`` database on the local server.
client = MongoClient('localhost', 27017)
db = client.test
House = db.House

# Browser-like request headers sent with every page download.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    # 'br' is deliberately not advertised: requests only decodes Brotli when
    # an optional Brotli library is installed, so a Brotli-compressed response
    # would otherwise come back as undecoded bytes in ``response.text``.
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': '......',
    'Host': 'bj.lianjia.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36',
}

# Base address of the listing pages; the page number is appended to it.
URL = 'https://bj.lianjia.com/ershoufang/pg'
def download(url):
    """Fetch *url* with the module-level ``headers`` and return the body text.

    The request is attempted at most twice. Returns ``None`` when every
    attempt fails with a requests-level error (connection problem, timeout,
    invalid response, ...).
    """
    attempts_left = 2
    while attempts_left > 0:
        attempts_left -= 1
        try:
            # A timeout keeps one stalled connection from hanging the crawl.
            response = requests.get(url, headers=headers, timeout=10)
            return response.text
        except requests.RequestException as exc:
            # requests raises its own exception hierarchy, not urllib2's.
            print('Download error: %s' % exc)
    return None


def parse_house_info(text):
    """Split one '|'-separated ``houseInfo`` string into named fields.

    The fields are taken positionally: address, house type, area,
    orientation, decoration and, when a sixth field is present, elevator.

    Returns a dict keyed ``Address``, ``House_type``, ``Area``, ``Toward``,
    ``Decorate`` and ``Elevete``, or ``None`` when *text* has fewer than the
    five mandatory fields.
    """
    fields = [part.strip() for part in text.split('|')]
    if len(fields) < 5:
        return None
    return {
        'Address': fields[0],
        'House_type': fields[1],
        'Area': fields[2],
        'Toward': fields[3],
        'Decorate': fields[4],
        # The key spelling "Elevete" is kept as-is because it is the field
        # name of the stored documents. A missing sixth field is stored as
        # the literal string "None".
        'Elevete': fields[5] if len(fields) > 5 else 'None',
    }


def get_message(url):
    """Scrape one listing page and store every listing in ``House``.

    Each ``div.priceInfo`` is paired by position with the ``div.houseInfo``
    at the same index. Every well-formed pair is printed as JSON and inserted
    as one document. Pages that could not be downloaded and entries with too
    few fields are skipped.
    """
    html = download(url)
    if html is None:
        # Nothing to parse; BeautifulSoup would raise on None.
        return

    soup = BeautifulSoup(html, 'html.parser')
    price_tags = soup.find_all('div', 'priceInfo')
    info_tags = soup.find_all('div', attrs={'class': 'houseInfo'})

    for price_tag, info_tag in zip(price_tags, info_tags):
        record = parse_house_info(info_tag.get_text())
        if record is None:
            continue
        # Format into a plain text value so the stored price does not keep a
        # reference to the parse tree.
        record['Price'] = u'%s' % price_tag.span.string
        # Build the document directly instead of formatting and re-parsing a
        # JSON string, which broke on quotes or backslashes in scraped text.
        print(json.dumps(record, ensure_ascii=False, sort_keys=True))
        House.insert_one(record)


if __name__ == '__main__':
    page_count = 100
    delay_seconds = 1

    started = time.time()
    print(started)
    for num in range(1, page_count + 1):
        url = URL + str(num)
        print(url)
        get_message(url)
        # Pause between pages to avoid hammering the site.
        time.sleep(delay_seconds)
    finished = time.time()
    print('Total time:')
    # Report elapsed time net of the deliberate sleeps.
    print(finished - started - page_count * delay_seconds)