3.16 Python爬虫 (四)
全部信息:
In [10]:
# Scrape one rental-listing detail page and collect its fields into a dict.
house_url = 'https://bj.lianjia.com/zufang/101102627382.html'
soup = get_page(house_url)
price = soup.find('span', class_='total').text
unit = soup.find('span', class_='unit').text.strip()
house_info = soup.find_all('p')
# The slices strip the Chinese field labels (e.g. "面积:") from each <p> text.
area = house_info[0].text[3:]
layout = house_info[1].text[5:]
floor = house_info[2].text[3:]
direction = house_info[3].text[5:]
subway = house_info[4].text[3:]
community = house_info[5].text[3:]
location = house_info[6].text[3:]
create_time = house_info[7].text[3:]
agent = soup.find('a', class_='name LOGCLICK')
agent_name = agent.text  # BUG FIX: was commented out, but the dict below uses it
agent_id = agent.get('data-el')
# BUG FIX: find_all() returns a ResultSet, which has no .find(); use find()
# to get the single evaluation <div>.
evaluate = soup.find('div', class_='evavate')
score, number = evaluate.find('span', class_='rate').text.split('/')
times = evaluate.find('span', class_='time').text[5:-1]
info = {
    '价格': price,
    '单位': unit,
    '面积': area,
    '户型': layout,
    '楼层': floor,
    '朝向': direction,
    '发布时间': create_time,
    '地铁': subway,
    '小区': community,
    '位置': location,
    '经纪人名字': agent_name,
    '经纪人id': agent_id,
}
info
#结果是:
'价格' : ……
'单位' : ……
'面积' : ……
………………
#封装代码:
#获取url下的页面内容
def get_page(url, timeout=10):
    """Fetch *url* and return the parsed BeautifulSoup document.

    A default timeout is supplied because requests otherwise waits
    indefinitely, which would hang the whole crawl on one stalled server.
    The new parameter is keyword-compatible with all existing callers.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, 'lxml')
    return soup
#封装成列表,作用是获得列表页下面的所有租房页面的链接,返回一个链接列表
def get_links(url):
    """Return the detail-page URLs of every rental listed on page *url*.

    BUG FIX: the original called get_page(link_url) — an undefined name —
    instead of the *url* parameter, so every call raised NameError.
    """
    soup = get_page(url)
    links_div = soup.find_all('div', class_="pic-panel")
    links = [div.a.get('href') for div in links_div]
    return links
#获取租房的信息
def get_house_info(house_url):
    """Scrape one rental detail page and return its fields as a dict.

    BUG FIX: the original immediately overwrote *house_url* with a
    hard-coded example URL, so the function scraped the same house no
    matter what argument it received.
    """
    soup = get_page(house_url)
    price = soup.find('span', class_='total').text
    unit = soup.find('span', class_='unit').text.strip()
    house_info = soup.find_all('p')
    # The slices strip the Chinese field labels (e.g. "面积:") from each <p>.
    area = house_info[0].text[3:]
    layout = house_info[1].text[5:]
    floor = house_info[2].text[3:]
    direction = house_info[3].text[5:]
    subway = house_info[4].text[3:]
    community = house_info[5].text[3:]
    location = house_info[6].text[3:]
    create_time = house_info[7].text[3:]
    agent = soup.find('a', class_='name LOGCLICK')
    agent_name = agent.text  # BUG FIX: was commented out but is used below
    agent_id = agent.get('data-el')
    # BUG FIX: find_all() returns a ResultSet with no .find(); use find().
    evaluate = soup.find('div', class_='evavate')
    score, number = evaluate.find('span', class_='rate').text.split('/')
    times = evaluate.find('span', class_='time').text[5:-1]
    info = {
        '价格': price,
        '单位': unit,
        '面积': area,
        '户型': layout,
        '楼层': floor,
        '朝向': direction,
        '发布时间': create_time,
        '地铁': subway,
        '小区': community,
        '位置': location,
        '经纪人名字': agent_name,
        # BUG FIX: insert() reads house['经纪人id'], so this key must exist.
        '经纪人id': agent_id,
    }
    return info
缩进:选中 + Tab键
#获取数据后,就要往数据库里存储
#本节用到的数据库;如果没有安装 MySQLdb,下面的导入会运行报错
In [243]:
import time
import MySQLdb
import requests
#还可以生成一个函数:
In [166]:
# Connection settings for the MySQL database (draft version of the final
# Lianjia config below).
DATABASE = {
    'host': '127.0.0.1',  # for a remote database, use the server's IP here
    'database': 'Examination',
    'user': 'root',
    'password': 'wangwei',
    'charset': 'utf8mb4',
}


def get_db(setting):
    """Open a MySQL connection from a settings dict like DATABASE."""
    return MySQLdb.connect(**setting)


def insert(db, house_info):
    """Skeleton insert: build an SQL statement and run it through a cursor."""
    sql = ''  # the real INSERT statement goes here
    cursor = db.cursor()
    # BUG FIX: execute() is a cursor method and was misspelled 'excute';
    # commit() was misspelled 'comit'. The dict above was also missing
    # commas after 'host' and 'password' (SyntaxError).
    cursor.execute(sql)
    db.commit()
关键是怎么写sql语句,它可能很多,要用三引号
In [1]:
# Connection settings for the Lianjia MySQL database.
DATABASE = {
    'host': '127.0.0.1',
    'database': 'Lianjia',
    'user': 'root',
    'password': 'wangwei',
    'charset': 'utf8mb4',
}

# Dict keys of a scraped house, in the same order as the table columns below.
_HOUSE_KEYS = ('价格', '单位', '面积', '户型', '楼层', '朝向', '地铁',
               '小区', '位置', '经纪人名字', '经纪人id', '发布时间')


def get_db(setting):
    """Open a MySQL connection from a settings dict like DATABASE."""
    return MySQLdb.connect(**setting)


def insert(db, house_info):
    """Insert one scraped house dict into the `house` table and commit.

    BUG FIXES vs. the original:
    - values were read from an undefined name `house` instead of the
      `house_info` parameter (NameError);
    - the SQL string was syntactically invalid (single-quoted identifiers,
      unclosed column list, a Python comment inside the string, a column
      name with stray spaces);
    - raw string interpolation is replaced by a parameterized query, so
      the driver escapes the values (also closes an SQL-injection hole).
    """
    sql = (
        "INSERT INTO house "
        "(price, unit, area, layout, floor, direction, subway, community, "
        "location, agent_name, agent_id, create_time) "
        "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
    )
    values = tuple(house_info[key] for key in _HOUSE_KEYS)
    cursor = db.cursor()
    cursor.execute(sql, values)
    db.commit()
3.17 Python爬虫 (五)
#根据上面的,这里是可以运行出来的,任意一个租房网址,就可获得一套房子的租房信息
#为了避免链家封锁访问,先运行一次,后面不再输入
In [ ]:
#直接输入
# Driver: open the DB, list every rental on the index page, then scrape and
# store each one.
db = get_db(DATABASE)
# get_links returns the detail-page URL of every listing on the index page.
links = get_links('https://bj.lianjia.com/zufang/')
for link in links:
    time.sleep(2)  # throttle requests so Lianjia does not block the crawler
    # BUG FIX: both calls were misspelled (get_house_infor / inser),
    # raising NameError on the first iteration.
    house = get_house_info(link)
    print('获取一个房子信息成功')
    insert(db, house)