目标:
爬取国内油价数据,并存入数据库,存入数据库的表名以 oil+当天日期命名。
分析过程
1.chrome浏览器输入网址,查看网页的源代码
2.观察所需爬取的数据在源代码中的位置,发现所需数据被包围在table内
1以<table width="100%" border="0" cellpadding="4" cellspacing="1" bgcolor="#B6CCE4">开头 2以</table>结尾
如下图
3.搜索<table width="100%" border="0" cellpadding="4" cellspacing="1" bgcolor="#B6CCE4">,发现只有一处匹配,符合爬取条件
4.获取到table内的数据后,再获取table内每个<tr></tr>内的数据
5.通过循环获取<tr></tr>内每个<td></td>内的数据,保存为一个列表
6.最后存入数据库即可
方法说明:
1commom.mysql_common.py:使用python操作mysql数据库的方法封装; 2common.table_name.py:命名数据库表名; 3push_data_to_mysql.py:获取网站数据,存入数据库
全部代码
commom.mysql_common.py
1# -*- coding: utf-8 -*- 2from pymysql import * 3import pymysql 4 5class Mysql: 6 def __init__(self, host, port, user, password, db, charset='utf8'): 7 self.host = host 8 self.port = port 9 self.db = db 10 self.user = user 11 self.password = password 12 self.charset = charset 13 14 def connectsql(self): 15 self.conn = connect(host=self.host, 16 port=self.port, 17 user=self.user, 18 passwd=self.password, 19 db=self.db, 20 charset=self.charset) 21 self.cursor = self.conn.cursor(pymysql.cursors.DictCursor) 22 23 def closesql(self): 24 self.cursor.close() 25 self.conn.close() 26 27 def execute(self, sql): 28 try: 29 self.connectsql() 30 self.cursor.execute(sql) 31 data = self.cursor.fetchall() 32 print(data) 33 self.conn.commit() 34 35 self.closesql() 36 37 except Exception as e: 38 print(e)
common.table_name.py
1# -*- coding: utf-8 -*- 2import time 3import re 4 5# 表名称 6def table_name(): 7 date_time = time.strftime("%Y-%m-%d", time.localtime()) 8 data = re.split(r'-', date_time) 9 name = "oil" + data[0] + data[1] + data[2] 10 return name
push_data_to_mysql.py
1get_html_text:获取网站的文本数据 2parse_data:对html数据进行解析,获取到所需数据存入一个列表并返回 3create:mysql创建一个表格,用来存入数据(表格名为 oil+当天年月日) 4insert:将数据插入创建的表格
1# -*- coding: utf-8 -*- 2import requests 3import re 4from common.mysql_common import Mysql 5from common.table_name import table_name 6 7 8class PushOilDataToMysql: 9 def __init__(self): 10 self.html = self.get_html_text() 11 self.oil_data = self.parse_data() 12 self.name = table_name() 13 self.do_mysql = Mysql('localhost', 3306, 'root', 'quanguo123..', 'myoildata') 14 15 @staticmethod 16 def get_html_text(): 17 url = 'http://youjia.chemcp.com/' 18 try: 19 r = requests.get(url, timeout=30) 20 r.raise_for_status() 21 r.encoding = r.apparent_encoding 22 return r.text 23 except: 24 return "" 25 26 def parse_data(self): 27 oil_data = [] 28 try: 29 # 获取<table开头到</table>结尾的数据 30 # 匹配规则, re.S 是因为“.”的作用是匹配除“\n”以外的任何字符, 而在html中有很多"\n" 31 rule_table = '<table width="100%" border="0" cellpadding="4" cellspacing="1" bgcolor="#B6CCE4">\r\n(.*?)</table>' 32 html_data = re.findall(rule_table, self.html, re.S) 33 34 # 获取<tr>开头到</tr>结尾的数据,所有城市的内容 35 # 匹配规则 36 rule_tr = '<tr>\r\n(.*?)</tr>' 37 citys_data = re.findall(rule_tr, html_data[0], re.S) # data_list 为匹配的城市数据 38 39 for i in range(1, len(citys_data)): 40 city_data = citys_data[i] 41 # 匹配规则 42 rule_city_detail = '<td bgcolor="#FFFFFF">(.*?)</td>' 43 city_detail_data = re.findall(rule_city_detail, city_data, re.S) 44 area = city_detail_data[0].split('>')[1].split("<")[0] 45 oil89 = eval(city_detail_data[1]) 46 oil92 = eval(city_detail_data[2]) 47 oil95 = eval(city_detail_data[3]) 48 oil98 = eval(city_detail_data[4]) 49 oil0 = eval(city_detail_data[5]) 50 update_time = city_detail_data[6] 51 oil_data.append((area, oil89, oil92, oil95, oil98, oil0, update_time)) 52 except: 53 print("获取油价数据失败") 54 return oil_data 55 56 def create(self): 57 sql_create_table = 'create table `%s`(id int auto_increment,`area` varchar(20),`oil89` varchar(20),`oil92` varchar(20),`oil95` varchar(20),`oil98` varchar(20), `oil0` varchar(20), `update_time` varchar(255), primary key(id))' % self.name 58 self.do_mysql.execute(sql_create_table) 59 60 def insert(self): 61 for i in range(len(self.oil_data)): 62 sql_insert = 'insert into %s(area, oil89, oil92, oil95, oil98, oil0, update_time) value%s' % (self.name, self.oil_data[i]) 63 self.do_mysql.execute(sql_insert) 64 65 66if __name__ == '__main__': 67 test = PushOilDataToMysql() 68 test.create() 69 test.insert()