# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 10:10:22 2019
@author:
"""
import bs4
import requests
import time  # used to time the whole scrape
def open_url(url):
    """Fetch a page with a browser User-Agent; return the Response, or None on timeout."""
    hd = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 '
                        '(KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36'}
    try:
        return requests.get(url, headers=hd, timeout=10)
    except requests.exceptions.Timeout:
        # requests raises its own Timeout; urllib's URLError never fires here
        print('Request timed out, moving on to the next one')
        return None
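# A minimal alternative sketch (not wired into the code below): share one
# requests.Session across all fetches and let urllib3 retry transient failures.
# The retry count and backoff factor are assumptions, not part of the original script.
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(retries=3, backoff=1.0):
    s = requests.Session()
    adapter = HTTPAdapter(max_retries=Retry(total=retries, backoff_factor=backoff))
    s.mount('https://', adapter)  # retry/connection-pool settings apply to both schemes
    s.mount('http://', adapter)
    return s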
host = 'https://wx.lianjia.com/ershoufang/binhu/pg'  # Lianjia second-hand listings, Binhu district, paginated
whvj = []  # listing detail-page URLs
ee = []    # prices
aa = []    # titles
bb = []    # attribute block ("content" div)
cc = []    # unused (see the commented-out field below)
dd = []    # summary line ("info" span)
ff = []    # sub-district names
count = 36  # first results page to fetch
start = time.time()
size = 1   # detail pages fetched so far, for the progress bar
q = 101    # stop before this page, i.e. fetch pages 36-100
while count < q:
    url = host + str(count)
    r = open_url(url)
    count = count + 1
    if r is None:  # timed out; skip this results page
        continue
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    targets = soup.find_all('a', class_="img")  # each listing card is linked via <a class="img">
    for each in targets:
        whvj.append(each['href'])
    print('\r' + "Pages fetched: " + int(count / q * 100) * "█"
          + " [" + str(round(float(count / q) * 100, 2)) + "%]", end="")
    # print(url)
    # print(whvj)
whvj = list(set(whvj))  # drop duplicate listing URLs (set() also discards the order)
content_size = len(whvj)  # total detail pages, used by the progress bar below
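# If discovery order matters, a dict-based dedupe keeps it (a minor
# alternative to the set() above):
#     whvj = list(dict.fromkeys(whvj))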
links_ok = []  # URLs that scraped cleanly, so every column list stays aligned
for link in whvj:
    try:
        page = open_url(link)
        if page is None:
            continue
        soup1 = bs4.BeautifulSoup(page.text, 'html.parser')
        title = soup1.find_all('h1', class_='main')
        xbxi = soup1.find_all("div", class_="content")
        info = soup1.find_all("span", class_="info")[0].text.split()
        jxge = soup1.find_all('div', class_='price')
        xnqu = soup1.find_all('div', class_='fl l-txt')
        gg = [x.text.split() for x in xnqu]
        sub_district = gg[0][8]  # breadcrumb trail; token 9 is the sub-district
        # Append only after every field was extracted, so one malformed page
        # cannot leave the parallel lists out of step with each other.
        aa.append(title[0].text.split())
        bb.append(xbxi[2].text.split())
        # cc.append(xbxi[3].text.split())
        dd.append(info)
        ee.append(jxge[0].text)  # one price block per detail page, as the indexing below assumes
        ff.append(sub_district)
        links_ok.append(link)
        size = size + 1
        print('\r' + "Listings fetched: " + int(size / content_size * 100) * "█"
              + " [" + str(round(float(size / content_size) * 100, 2)) + "%]", end="")
    except (OSError, IndexError):
        # IndexError covers pages whose layout lacks one of the blocks above
        continue
# Assemble one '^'-separated line per successfully scraped listing.
result = []
length = len(links_ok)
for i in range(length):
    result.append(str(dd[i]) + '^' + str(aa[i]) + '^' + str(links_ok[i])
                  + '^' + str(ee[i]) + '^' + str(ff[i])
                  + '^' + str(bb[i]) + '^' + '\n')
end = time.time()
print("Total time: " + str(end - start) + " seconds")
with open('bbhu1.txt', 'w', encoding='utf-8') as f:
    for each in result:
        f.write(each)
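# The '^'-delimited dump can be loaded back for cleaning, e.g. with the pandas
# import used in the geocoding step below (these column names are assumptions;
# the trailing '^' produces one empty padding column):
#     df = pd.read_csv('bbhu1.txt', sep='^', header=None,
#                      names=['info', 'title', 'url', 'price', 'district', 'attrs', 'pad'])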
# Get the longitude/latitude of each neighborhood
import json
import csv
from urllib.request import urlopen
from urllib.parse import quote  # quote lives in urllib.parse, not urllib.request
import pandas as pd

xlsx_1 = pd.ExcelFile('bbhu.xlsx')  # spreadsheet with a '小区' (neighborhood) column
data1 = xlsx_1.parse('Sheet2')
url = 'http://api.map.baidu.com/geocoder/v2/'  # Baidu geocoding API
output = 'json'
ak = 'raSWR0VKik7******shHzFH'  # Baidu Maps API key (redacted)
s = data1['小区']  # neighborhood names
jkdu = []  # longitudes
wwdu = []  # latitudes
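# A successful geocoder response has roughly this shape (inferred from the keys
# read below; the coordinate values here are made-up examples):
#     {"status": 0,
#      "result": {"location": {"lng": 120.30, "lat": 31.52}, ...}}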
names_ok = []  # names that geocoded successfully, to keep the columns aligned
for i in s:
    try:
        add = quote(i)  # URL-encode the Chinese neighborhood name
        uri = url + '?' + 'address=' + add + '&output=' + output + '&ak=' + ak
        req = urlopen(uri)
        res = req.read().decode()
        temp = json.loads(res)
        jkdu.append(temp['result']['location']['lng'])
        wwdu.append(temp['result']['location']['lat'])
        names_ok.append(i)
    except (OSError, KeyError):
        # KeyError covers responses with no 'result' block (address not found)
        continue
result = []
length = len(jkdu)
for i in range(length):
    result.append(str(names_ok[i]) + ' ' + str(jkdu[i]) + ' ' + str(wwdu[i]))
with open('xiaoqujkww.csv', 'a', newline='', encoding='utf-8') as f:
    write = csv.writer(f)
    for data in result:
        write.writerow([data])
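# Design note: each row above is a single space-separated string in one CSV
# cell, which preserves the original output format. Writing three real columns
# would be easier to consume downstream, e.g.:
#     write.writerow([names_ok[i], jkdu[i], wwdu[i]])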