# Python crawler - ajk

#!/usr/bin/env python
#coding=utf8
from bs4 import BeautifulSoup
from bs4 import SoupStrainer
import urllib,os,re,random,string,time,sys
import urllib.request
import sqlite3
import pymysql
from urllib import request
import threading

# Import the helper function

from formatting import formatting

# Module-level database handles used by the rest of the script.
# dbcur is a placeholder until the connection below is established.
dbcur = ""
dbpath = os.getcwd()
print(dbpath)

try:
    # Keyword arguments can be used here as well.
    # NOTE(review): the credentials are hard-coded in source; consider
    # loading them from configuration or the environment instead.
    dbcur = pymysql.connect(host='120.25.69.26', port=3306, user='ls_spider', passwd='kmlskj0001*', db='ls_spider', charset='utf8')
    # Create a cursor.
    cursor = dbcur.cursor()
    # ziduan = 'year,area,name,type,totlafloor,floor,fitmant,toward,address,price,totalprice,url,ischeck'
except Exception as e:
    print('\n occurred exception error %s' % (e))
    # A failed connection is fatal: report it with a non-zero exit status.
    sys.exit(1)

# Create a directory

def mkdir(path):
    """Create the directory *path*, including any missing parents.

    Surrounding whitespace and trailing backslashes are stripped from
    *path* before it is used.

    Args:
        path: Directory path to create.

    Returns:
        True, whether the directory was created or already existed.
    """
    path = path.strip()
    # A single backslash must be written escaped inside a string literal.
    path = path.rstrip("\\")
    if not os.path.exists(path):
        # exist_ok tolerates another process creating the directory
        # between the existence check and this call.
        os.makedirs(path, exist_ok=True)
    return True

# Dictionary holding the auction types

dictl = {
    'housing_Auction': 'https://km.anjuke.com/sale/',  # residential
}

# Collect residential-land listing information

def _fetch_soup(url, headers):
    """Download *url* with the given request headers and parse it as HTML."""
    req = request.Request(url, headers=headers)
    # The context manager closes the HTTP response once the body is read.
    with request.urlopen(req) as response:
        html = response.read().decode('utf-8')
    return BeautifulSoup(html, 'html.parser')


def _squash(text):
    """Strip *text* and remove embedded newlines and spaces."""
    return text.strip().replace('\n', '').replace(' ', '')


def _parse_listing(list_soup):
    """Extract the listing fields from a parsed detail page.

    The fields are read by position from the ``houseInfo-content`` divs,
    so an IndexError/AttributeError is raised when the page layout does
    not match; the caller handles that per listing.

    Returns:
        A tuple ``(year, area, name, types, totalfloor, floor, fitment,
        toward, address, price, totalprice, onlyone, nature, numberyear)``.
        ``area`` is the list returned by ``re.findall``; every other
        field is a string.
    """
    parameter = list_soup.find_all('div', attrs={'class': 'houseInfo-content'})

    year = re.sub(r'\D', '', parameter[6].get_text().strip())
    # The dot is escaped so only a real decimal point joins the digits.
    area = re.findall(r"\d+\.?\d*", parameter[4].get_text().strip())
    name = parameter[0].find('a').get_text().strip()
    types = re.sub(r'\s+', '', parameter[1].get_text().strip())

    totalf = parameter[10].get_text().strip()
    totalfloor = re.sub(r'\D', '', totalf)
    floor = totalf[0:2]

    fitment = parameter[11].get_text().strip()
    toward = parameter[7].get_text().strip()

    location = parameter[3]
    links = location.find_all('a')
    address = (
        _squash(location.find('p').get_text())
        + _squash(links[0].get_text())
        + _squash(links[1].get_text())
    )

    price = re.sub(r'\D', '', parameter[2].get_text().strip())
    totalpric = list_soup.find('span', attrs={'class': 'light info-tag'})
    totalprice = totalpric.find('em').get_text().strip()

    onlyone = parameter[16].get_text().strip()
    nature = parameter[15].get_text().strip()
    numberyear = parameter[12].get_text().strip()

    return (year, area, name, types, totalfloor, floor, fitment, toward,
            address, price, totalprice, onlyone, nature, numberyear)


def getHousing(str):
    """Crawl the first three index pages under *str* and process each listing.

    For page numbers 1 to 3 the URL ``<str>p<page>/`` is downloaded, every
    ``a.houseListTitle`` link on it is followed, the listing fields are
    extracted and handed to ``formatting``. Errors are printed and the
    crawl continues (per listing, and per index page).

    Args:
        str: Base URL of the listing index. The name shadows the builtin
            and is kept only so existing callers remain valid.

    Returns:
        None.
    """
    header = {'User-Agent': "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3"}
    for page in range(1, 4):
        try:
            print(page)
            # %-formatting is used because the builtin str() is shadowed
            # by the parameter name.
            housing = '%sp%d/' % (str, page)
            print(housing)
            while True:
                print(header)
                soup = _fetch_soup(housing, header)

                # Collect the detail-page link of every listing on this page.
                lists = []
                for anchor in soup.find_all('a', attrs={'class': 'houseListTitle'}):
                    try:
                        lists.append(anchor['href'])
                    except Exception as e:
                        print('\n get key exception occurred. %s' % (e))

                # Visit each listing and extract its fields.
                for url in lists:
                    try:
                        # Pause before and after each download to throttle requests.
                        time.sleep(1)
                        list_soup = _fetch_soup(url, header)
                        time.sleep(2)
                        (year, area, name, types, totalfloor, floor, fitment,
                         toward, address, price, totalprice, onlyone, nature,
                         numberyear) = _parse_listing(list_soup)
                        formfloor = ''
                        print(year, area, name, types, totalfloor, floor, fitment, toward, address, price, totalprice, onlyone, nature, numberyear, url)
                        formatting(year, area, name, types, totalfloor, floor, fitment, toward, address, price, totalprice, onlyone, nature, numberyear, url)
                        print(formfloor, fitment, toward, totalprice, price, area)
                    except Exception as e:
                        # Best effort: one bad listing must not stop the page.
                        print('\n get key excption occurred. %s' % (e))

                # When the site answers with its captcha page, open it in a
                # browser, wait briefly, and retry the same index page.
                title = soup.title
                if title is not None and title.text.strip() == u'请输入验证码':
                    import webbrowser
                    webbrowser.open(housing)
                    time.sleep(3)
                else:
                    break
        # On any other failure, report it and move on to the next page.
        except Exception as e:
            print('\n get key exception occurred. %s' % (e))

# Run one crawl immediately when the script starts.
getHousing(dictl['housing_Auction'])

# Hour of the day (as produced by the "%H" format) at which the
# scheduled crawl in fun_time fires.
times = '03'
def fun_time(urls):
    """Poll the clock forever and crawl *urls* when the hour equals ``times``.

    On every iteration the current local hour is printed together with
    the running thread's name; if that hour matches the module-level
    ``times`` value, ``getHousing(urls)`` is called. The loop then sleeps
    2000 seconds before checking again. This function never returns.

    Args:
        urls: Base listing URL passed through to ``getHousing``.
    """
    while True:
        # Report which thread is doing the polling.
        thread_name = threading.current_thread().name
        sj = time.strftime('%H', time.localtime(time.time()))
        print('thread %s every day %s is running…' % (thread_name, sj))

        if sj == times:
            getHousing(urls)
        # Sleep 2000 seconds (roughly 33 minutes) between checks.
        # NOTE(review): with this interval the crawl can fire more than
        # once within the matching hour - confirm that is intended.
        time.sleep(1000 * 2)

# Run the scheduler on a named thread and wait for it to finish.
# NOTE(review): fun_time loops without a return, so join() presumably
# blocks for the life of the process and the cleanup below only runs if
# that loop is ever changed to terminate.
housin = threading.Thread(target=fun_time, name='housin', args=(dictl['housing_Auction'],))
housin.start()
housin.join()

cursor.close()

# Close the database connection.
dbcur.close()
print('done')
os.system("pause")
# Exit last so the final message and pause above are reachable.
sys.exit(0)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值