简单爬虫
python库
1、requests 用来获取页面内容
2、BeautifulSoup
文档链接:https://www.crummy.com/software/BeautifulSoup/bs4/doc/index.zh.html
爬取链家网的信息
安装第三方库
pip install requests
pip install bs4
新建数据库:
CREATE DATABASE /*!32312 IF NOT EXISTS*/`house` /*!40100 DEFAULT CHARACTER SET utf8 */;
USE `house`;
/*Table structure for table `db_house` */
DROP TABLE IF EXISTS `db_house`;
CREATE TABLE `db_house` (
  `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
  `price` varchar(80) DEFAULT NULL,
  `unit` varchar(80) DEFAULT NULL,
  `area` varchar(80) DEFAULT NULL,
  `layout` varchar(80) DEFAULT NULL,
  `floor` varchar(80) DEFAULT NULL,
  `direction` varchar(80) DEFAULT NULL,
  `subway` varchar(80) DEFAULT NULL,
  `community` varchar(80) DEFAULT NULL,
  `location` varchar(80) DEFAULT NULL,
  `agent_name` varchar(80) DEFAULT NULL,
  `agent_id` varchar(80) DEFAULT NULL,
  -- The PRIMARY KEY already indexes `id`; the original extra `KEY id(id)`
  -- was a redundant duplicate index and has been dropped.
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=40 DEFAULT CHARSET=utf8;
爬虫程序如下:
import time
import pymysql
import requests
from bs4 import BeautifulSoup
# 获取url下的页面内容,返回soup对象
def get_page(url):
    """Fetch *url* and return the page parsed as a BeautifulSoup object.

    Parameters:
        url: absolute URL of the page to download.
    Returns:
        bs4.BeautifulSoup built with the 'html5lib' parser.
    Raises:
        requests.Timeout if the server does not answer within 10 s.
        requests.HTTPError on a 4xx/5xx response (instead of silently
        parsing an error page, as the original did).
    """
    # A timeout is essential for a crawler: without one a single dead
    # server hangs the whole run forever.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html5lib')
# 将以上的代码封装成一个获取链接的方法函数,作用是获取列表页下面的所有租房页面的链接,返回链接列表
def get_links(link_url):
    """Return the detail-page URLs of every listing on the list page.

    Parameters:
        link_url: URL of a rental list page.
    Returns:
        list[str] of href values, one per listing.
    """
    soup = get_page(link_url)
    links_div = soup.find_all('div', class_="pic-panel")
    # Guard against a panel without an <a>: the original `div.a.get(...)`
    # raised AttributeError on any malformed listing and killed the run.
    return [div.a.get('href') for div in links_div if div.a is not None]
def get_house_info(house_url):
    """Scrape one rental detail page and return its fields as a dict.

    Parameters:
        house_url: URL of a single listing's detail page.
    Returns:
        dict whose Chinese-label keys match what insert() expects;
        do not rename them.
    """
    soup = get_page(house_url)
    price = soup.find('span', class_='total').text
    unit = soup.find('span', class_='unit').text.strip()
    # Detail attributes come as a fixed-order list of <p> tags; each text
    # starts with its Chinese label, which the slices below strip off.
    # NOTE(review): offsets assume lianjia.com's label lengths at the time
    # of writing — verify against the live page if scraping breaks.
    house_info = soup.find_all('p')
    area = house_info[0].text[3:]
    layout = house_info[1].text[5:]
    floor = house_info[2].text[3:]
    direction = house_info[3].text[5:]
    subway = house_info[4].text[3:]
    community = house_info[5].text[3:]
    location = house_info[6].text[3:]
    create_time = house_info[7].text[3:]
    agent = soup.find('a', class_='name LOGCLICK')
    agent_name = agent.text
    agent_id = agent.get('data-el')
    # The original also scraped the 'evaluate' div (score/number/times)
    # but never used the values, and crashed with AttributeError whenever
    # that div was missing — that dead, fragile code has been removed.
    return {
        '价格': price,
        '单位': unit,
        '面积': area,
        '户型': layout,
        '楼层': floor,
        '朝向': direction,
        '发布时间': create_time,
        '地铁': subway,
        '小区': community,
        '位置': location,
        '经纪人姓名': agent_name,
        '经纪人ID': agent_id
    }
# MySQL connection settings, unpacked into pymysql.connect() by get_db().
DATABASE = {
    'host': 'localhost',  # for a remote database, use the server's IP address here
    'database': 'house',
    'user' : 'root',
    'password': 'toor',
    # character-set encoding; prevents garbled (mojibake) data
    'charset' : 'utf8'
}
def get_db(setting):
    """Open and return a pymysql connection built from *setting*.

    Parameters:
        setting: mapping of keyword arguments for pymysql.connect()
            (host, database, user, password, charset, ...).
    """
    connection = pymysql.connect(**setting)
    return connection
def insert(db, house):
    """Insert one scraped *house* dict into the db_house table.

    Parameters:
        db: an open pymysql connection (or any DB-API connection).
        house: dict produced by get_house_info(); its Chinese-label keys
            are read positionally into the column list below.

    The original built the statement with str.format, so any quote or
    backslash in the scraped text broke the SQL — and made the crawler
    injectable. A parameterized query lets the driver do the escaping.
    It also leaked the cursor; try/finally closes it even on error.
    """
    sql = (
        "insert into db_house(`price`,`unit`,`area`,`layout`,`floor`,"
        "`direction`,`subway`,`community`,`location`,`agent_name`,`agent_id`) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )
    params = (
        house['价格'], house['单位'], house['面积'], house['户型'],
        house['楼层'], house['朝向'], house['地铁'], house['小区'],
        house['位置'], house['经纪人姓名'], house['经纪人ID'],
    )
    cursor = db.cursor()
    try:
        cursor.execute(sql, params)
        db.commit()
    finally:
        cursor.close()
# --- entry point: crawl the list page and store every listing ---
db = get_db(DATABASE)
links = get_links('http://bj.lianjia.com/zufang/')
for link in links:
    # Be polite to the server: pause between requests.
    time.sleep(2)
    house = get_house_info(link)
    # Report success only AFTER the page was actually fetched and parsed;
    # the original printed this before the request, lying on failures.
    print('获取一个房子信息成功!')
    print(house, end='\r')
    insert(db, house)
# Release the MySQL connection when the crawl is done (original leaked it).
db.close()
打开数据库,可以看到租房信息已经存储到mysql数据库。