XPath 的解析速度比正则表达式慢,但使用起来更加简单。
XPath 更注重数据的层级结构,需要沿着节点路径一层一层地查找,直到定位到想要的元素。XPath 的具体用法可以参见这篇文章:
《如何使用xpath进行数据的提取》
要注意,在提取之前需要先对服务器返回的信息进行转换处理:用 etree.HTML() 将 HTML 文本解析为 lxml 的元素对象,之后才能在该对象上调用 xpath() 方法。
在下面这个案例中主要用了 requests、lxml 与 pymysql 模块,将提取到的信息保存到数据库中。
import requests, json, random,pymysql
from lxml import etree
# HTTP headers sent with every request made by this script.
# NOTE(review): the 'Cookie' value looks like a placeholder -- confirm
# whether a real session cookie has to be supplied before running.
headers = {
    'X-Requested-With': 'XMLHttpRequest',
    'Cookie': 'xxx',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:67.0) Gecko/20100101 Firefox/67.0',
}

# Open the MySQL connection (database "youxin" on localhost:3306).
db = pymysql.Connect(
    host='localhost',
    port=3306,
    user='root',
    passwd='123',
    db='youxin',
)

# Cursor shared by the functions below for executing SQL statements.
cursor = db.cursor()
def get_city_name(timeout=10):
    """Fetch the list of cities offered by xin.com.

    Calls the site's ``get_home_city`` Ajax endpoint with the module-level
    ``headers`` and reads the ``data -> city_all`` mapping of the JSON reply.

    Args:
        timeout: Seconds to wait for the HTTP response before giving up.
            Without a timeout a stalled server would block the crawl forever.

    Returns:
        A list with one dict per city. Each dict has two keys:
        ``'城市名'`` (the entry's ``cityname``) and ``'城市简名'`` (the
        entry's ``ename``, used by ``get_car_msg`` as the URL path segment).

    Raises:
        requests.RequestException: On a network failure, a timeout, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError: If the JSON reply lacks one of the expected keys.
    """
    base_url = 'https://www.xin.com/apis/Ajax_common/get_home_city/?'
    response = requests.get(base_url, headers=headers, timeout=timeout)
    # Fail early with a clear HTTP error instead of trying to JSON-decode
    # an error page.
    response.raise_for_status()
    data = json.loads(response.content.decode('utf-8'))
    city_msg_dict = data['data']['city_all']
    # Only the values are needed; the mapping's keys (city codes) are unused.
    return [
        {'城市名': city_msg['cityname'], '城市简名': city_msg['ename']}
        for city_msg in city_msg_dict.values()
    ]
def _extract_car_row(card):
    """Pull the eight stored fields out of one listing-card element.

    Args:
        card: An lxml element matched by ``.//div[@class="across"]``.

    Returns:
        A tuple ``(title, price, run_miles, cangku, live_year, image,
        verify, first_pay)`` in the column order of the insert statement
        used by ``get_car_msg``.

    Raises:
        IndexError: If the card lacks one of the expected elements
            (everything except the down-payment text is required).
    """
    title = card.xpath('.//div/h2/span/text()')[0].strip()

    # Price and verification label sit in a <p> whose class attribute is
    # either empty or " change-mt "; try the empty one first.
    price = card.xpath('.//p[@class=""]/em/text()')
    verify = card.xpath('.//p[@class=""]/span/text()')
    if not price:
        price = card.xpath('.//p[@class=" change-mt "]/em/text()')
        verify = card.xpath('.//p[@class=" change-mt "]/span/text()')
    price = price[0].strip().replace('\n', '')
    verify = verify[0].strip()

    # The same text-node list carries both values: second node is the
    # mileage, first node is the year.
    pad_texts = card.xpath('.//div[@class="pad"]/span/text()')
    run_miles = pad_texts[1].strip()
    live_year = pad_texts[0].strip()
    cangku = card.xpath('.//div[@class="pad"]/span/span/text()')[0].strip()
    image = card.xpath('.//a[@class="aimg"]/img/@src')[0].strip()

    # The down-payment text is optional; store a marker when it is absent.
    first_pay = card.xpath('.//span[@class="pay-price"]/text()')
    first_pay = first_pay[0].rstrip() if first_pay else '无信息'

    return (title, price, run_miles, cangku, live_year, image, verify,
            first_pay)


def get_car_msg(pages=20, timeout=10):
    """Scrape used-car listings for every city and store them in MySQL.

    Drops and recreates the ``xincar`` table, then for each city returned by
    ``get_city_name`` downloads the listing page, extracts every
    ``div.across`` card and inserts one row per card. The module-level
    ``cursor`` and ``db`` are closed when the function finishes, whether it
    succeeds or raises.

    Args:
        pages: Number of listing requests to make per city (default 20,
            the value that was previously hard-coded).
        timeout: Seconds to wait for each HTTP response.

    Raises:
        requests.RequestException: On a network failure or timeout.
        IndexError: If a listing card lacks an expected element.
        pymysql.MySQLError: If a SQL statement fails.
    """
    # Parameterized statement: the driver escapes each value, so quotes or
    # other special characters in scraped text can neither break the SQL
    # nor inject into it.
    insert_sql = (
        'insert into xincar'
        '(title,price,run_miles,cangku,live_year,image_src,verify,first_pay) '
        'values(%s,%s,%s,%s,%s,%s,%s,%s)'
    )
    try:
        city_list = get_city_name()

        # Start from an empty table on every run.
        cursor.execute('drop table if exists xincar')
        cursor.execute(
            'create table if not exists xincar('
            'id int primary key auto_increment,'
            'title varchar(100),price varchar(100),run_miles varchar(100),'
            'cangku varchar(100),live_year varchar(32),'
            'image_src varchar(100),verify varchar(200),'
            'first_pay varchar(100))'
        )

        for city_info in city_list:
            base_url = 'https://www.xin.com/{}/s/'.format(city_info['城市简名'])
            for _page in range(1, pages + 1):
                # NOTE(review): the page number is not part of the URL, so
                # the same listing page is requested on every iteration and
                # its rows are inserted repeatedly. TODO: confirm the site's
                # pagination URL scheme and build the per-page URL here.
                response = requests.get(base_url, headers=headers,
                                        timeout=timeout)
                tree = etree.HTML(response.content.decode('utf-8'))
                # Fields per card: title, price, mileage, warehouse, year,
                # image, verification label, down payment.
                rows = [
                    _extract_car_row(card)
                    for card in tree.xpath('.//div[@class="across"]')
                ]
                if rows:
                    cursor.executemany(insert_sql, rows)
                    # Commit on the connection object, not the cursor.
                    db.commit()
    finally:
        # Always release the cursor and the connection, even on failure.
        cursor.close()
        db.close()
if __name__ == '__main__':
    # Script entry point: run the full crawl. To check only the city-list
    # request, call get_city_name() here instead.
    get_car_msg()
如果在放入数据库时出现下列错误:
pymysql.err.ProgrammingError: (1064, "You have an error in your SQL syntax; check the
manual that corresponds to your MySQL server version for the right syntax to use
near '舍曲林用于治疗抑郁症的相关症状,包括伴随焦虑、有或无' at line 1")
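这是因为要写入的文本中含有引号等特殊字符,破坏了拼接出来的 SQL 语句。可以在要放入数据库的变量前加上
name = pymysql.escape_string(name)
语句,以避免此类错误的发生(注意:PyMySQL 1.0 及以上版本中该函数位于 pymysql.converters.escape_string)。更推荐的做法是使用参数化查询,例如 cursor.execute('insert into t(name) values(%s)', (name,)),由驱动自动完成转义。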