用 Python 爬取网页数据并存到数据库中

本文展示了如何使用Python从指定URL抓取XML数据,解析XML文件,并将解析得到的数据插入到MySQL数据库的DATARETEST表中。程序首先连接到数据库,然后循环遍历XML文件中的Pointer元素,提取相关信息,处理缺失值,并将其存储到数据库中。
摘要由CSDN通过智能技术生成

# -*- coding: utf-8 -*-

import urllib

import xml.dom.minidom

import MySQLdb

import datetime

# Fetch hourly air-quality XML for each configured region, turn every
# <Pointer> element into one row, and insert the rows into the MySQL table
# DATARETEST.  Runs top to bottom as a script (Python 2: urllib.urlopen).
#
# Shape of the statement issued per station (this script also adds TimePoint):
#   INSERT INTO DATARETEST(Province, City, Area, PositionName, StationCode,
#       Latitude, Longitude, AQI, Quality,
#       CO, NO2, O3, O3_8h, PM10, PM2_5, PM2_5_24h, SO2)
#   VALUES ('<province>', '<city>', '<area>', '<station>', 'a001',
#       212, 212, 21, '<quality>', 12, 21, 21, 21, 21, 21, 21, 12)

# Timestamp taken once at start-up; every row written by this run shares it.
TimePoint = datetime.datetime.now()

# The script treats this marker as "no reading" and stores 0 in its place.
_NO_READING = '--'

# Tags read from each <Pointer>, in the column order City, Area, PositionName,
# StationCode, Latitude, Longitude, AQI, Quality.
# NOTE(review): StationCode is filled from <Color>, exactly as the original
# script did -- that looks like a wrong tag; confirm against the feed.
_TEXT_TAGS = ('City', 'Region', 'Name', 'Color', 'CLat', 'CLng', 'AQI', 'Level')

# Number of pollutant columns: CO, NO2, O3, O3_8h, PM10, PM2_5, PM2_5_24h, SO2.
_POLLUTANT_COLUMN_COUNT = 8

_INSERT_SQL = (
    'INSERT INTO DATARETEST(Province, City, Area, '
    'PositionName, StationCode, Latitude, Longitude, AQI, Quality, '
    'CO, NO2, O3, O3_8h, PM10, PM2_5, PM2_5_24h, SO2, TimePoint) '
    'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)'
)


def _first_text(node, tag, default=None):
    """Return the text of the first <tag> descendant of node.

    Returns default when the tag is absent or the element is empty, instead
    of raising IndexError / AttributeError and aborting the whole run.
    """
    matches = node.getElementsByTagName(tag)
    if not matches or matches[0].firstChild is None:
        return default
    return matches[0].firstChild.nodeValue


def _station_row(pointer):
    """Build the 18-value parameter tuple for _INSERT_SQL from one <Pointer>.

    Descriptive fields are UTF-8 encoded; a missing field becomes None, which
    the driver sends as SQL NULL.
    """
    described = []
    for tag in _TEXT_TAGS:
        text = _first_text(pointer, tag)
        described.append(text.encode("utf-8") if text is not None else None)

    # NOTE(review): the original looped over the <Poll> children but read the
    # first <Value> under the station for every pollutant, so all eight
    # columns get the same number.  That behavior is kept here because the
    # per-pollutant layout of <Poll> is not visible in this script -- map each
    # <Poll> to its own column once the feed schema is confirmed.
    reading = _first_text(pointer, 'Value', _NO_READING)
    if reading == _NO_READING:
        reading = 0

    return (
        ('河北省',)
        + tuple(described)
        + (reading,) * _POLLUTANT_COLUMN_COUNT
        + (TimePoint,)
    )


#mysql
# NOTE(review): credentials are hard-coded; consider moving them to config.
db = MySQLdb.Connect(host="localhost", user="root", passwd="root", db="pythondb", charset="utf8")

cursor = db.cursor()

urlPrefix = 'http://121.28.49.85:8080/datas/hour/'

regiondIds = ['130000']

try:
    for regiondId in regiondIds:
        fullUrl = urlPrefix + regiondId + '.xml'

        # Close the HTTP response even if read() fails.
        response = urllib.urlopen(fullUrl)
        try:
            data = response.read()
        finally:
            response.close()

        dom = xml.dom.minidom.parseString(data)
        root = dom.documentElement
        pointers = root.getElementsByTagName("Pointer")

        for pointer in pointers:
            cursor.execute(_INSERT_SQL, _station_row(pointer))

    # Commit once, after every region has been inserted successfully.
    db.commit()
finally:
    # Always release the connection, including when a fetch/parse/insert fails.
    db.close()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值