I have a CSV file which is downloaded from database(as it is in CSV) and now I have to parse into JSON Schema. Don't worry this link just github gist
Problem I am facing is its Multi line Header check CSV File Here
If you take notice in the file:
On 1st line of CSV it has 1st line of headers then next line has
all the values for those headers.
On 3rd line of CSV file it has 2nd line of headers then next line
has all the values for those headers.
On 5th line of CSV file it has 3rd line of headers then next line
has all the values for those headers.
Also you can notice the pattern here,
1st line of headers hasn't any tab
2nd line of headers has only one tab
3rd line of headers has two tabs
This goes for all the records.
Now 1st problem is this multi line of headers.
And 2nd problem is how to parse it into nested json as I have.
one of the solution I have tried Create nested JSON from CSV. and noticed the 1st problem with my csv.
My look like this. Where I am only trying to parse initial fields of schema
import csv
import json
def csvParse(csvfile):
# Open the CSV
f = open(csvfile, 'r')
# Change each fieldname to the appropriate field name.
reader = csv.DictReader(f, fieldnames=("Order Ref", "Order
Status", "Affiliate", "Source", "Agent", "Customer Name", "Customer Name", "Email
Address", "Telephone", "Mobile", "Address 1", "Address 2", "City", "County/State",
"Postal Code", "Country", "Voucher Code", " Voucher Amount", "Order Date", "Item ID",
"Type", "Supplier Code", "Supplier Name", "Booking Ref", "Supplier Price", "Currency", "Selling Price", "Currency", "Depart", "Arrive", "Origin",
"Destination", "Carrier", "Flight No", "Class", "Pax Type", "Title",
"Firstname", "Surname", "DOB", "Gender", "FOID Type"))
customer = []
data = []
# data frame names in a list
for row in reader:
frame = {"orderRef": row["Order Ref"],
"orderStatus": row["Order Status"],
"affiliate": row["Affiliate"],
"source": row["Source"],
"customers": []}
data.append(frame)
解决方案
This is not csv parser friendly, because the file contains multiple csv, and at least one contains 2 columns with same name, which prevents from using a DictReader.
I would first build a wrapper that allows to parse each csv fragment and kindly raises a stop iteration when it reaches a different fragment. I would use the re module to find the different headers.
Here is what the code for the wrapper could be:
class multi_csv:
levels = [re.compile('Order Ref,Order Status,Affiliate,Source,Agent,'
'.*,Country,Voucher Code,Voucher Amount,Order Date'),
re.compile('\s*,Item ID,Type,Supplier Code,Supplier Name,'
'.*,Arrive,Origin,Destination,Carrier,Flight No,Class,'),
re.compile('\s*,\s*,Pax Type,Title,Firstname,Surname,DOB,Gender,'
'FOID Type,*')
]
def __init__(self, fd):
self.fd = fd
self.level = 0
self.end = False
def __iter__(self):
return self
def __next__(self):
try:
line = next(self.fd)
except StopIteration:
self.end = True
raise
for lev, rx in enumerate(self.levels):
if rx.match(line):
self.level = lev
raise StopIteration('New level')
return line
It can then be used to build a Python object according to your Json schema:
mc = multi_csv(open(csvfile, 'r')
orders = []
while not mc.end:
rd = csv.reader(mc)
for row in rd:
if mc.level == 0:
items = []
orders.append({
'orderRef': int(row[0]),
'orderStatus': row[1],
'affiliate': row[2],
'source': row[3],
'agent': row[4],
'customer': {
'name': row[5],
'email': row[6],
'telephone': row[7],
'mobile': row[8],
'address': {
'address1': row[9],
'address2': row[10],
'city': row[11],
'county': row[12],
'postCode': row[13],
'country': row[14],
},
},
'voucherCode': row[15],
'voucherAmount': int(row[16]),
'orderDate': row[17],
'items': items,
})
elif mc.level == 1:
if len(row[1].strip()) != 0:
legs = []
passengers = []
items.append({
'itemId': int(row[1]),
'type': row[2],
'supplierCode': row[3],
'supplierName': row[4],
'bookingReference': row[5],
'supplierPrice': row[6],
'supplierPriceCurrency': row[7],
'sellingPrice': row[8],
'sellingPriceCurrency': row[9],
'legs': legs,
'passengers': passengers,
})
legs.append({
'departureDate': row[10],
'arrivalDate': row[11],
'origin': row[12],
'destination': row[13],
'carrier': row[14],
'flightNumber': row[15],
'class': row[16],
})
else: # mc.level == 2
passengers.append({
'passengerType': row[2],
'title': row[3],
'firstName': row[4],
'surName': row[5],
'dob': row[6],
'gender': row[7],
'foidType': row[8],
})
With your file, it gives the expected result:
pprint.pprint(orders)
[{'affiliate': ' ',
'agent': 'akjsd@ad.com',
'customer': {'address': {'address1': ' ',
'address2': ' ',
'city': ' ',
'country': ' ',
'county': ' ',
'postCode': ' '},
'email': 'asd@asd.com',
'mobile': ' ',
'name': 'Mr Kas Iasd',
'telephone': '3342926655'},
'items': [{'bookingReference': 'ABC123',
'itemId': 125,
'legs': [{'arrivalDate': 'ONEWAY',
'carrier': 'PK',
'class': 'Economy',
'departureDate': '12/01/2018 13:15',
'destination': 'LHE',
'flightNumber': '354',
'origin': 'KHI'}],
'passengers': [{'dob': '19/09/1995',
'firstName': 'Aasdsa',
'foidType': 'None',
'gender': 'Male',
'passengerType': 'Adult',
'surName': 'Mas',
'title': 'Mr'},
{'dob': '07/12/1995',
'firstName': 'Asdad',
'foidType': 'None',
'gender': 'Male',
'passengerType': 'Adult',
'surName': 'Dasd',
'title': 'Mr'}],
'sellingPrice': '5002',
'sellingPriceCurrency': 'PKR',
'supplierCode': 'SB',
'supplierName': 'Sabre',
'supplierPrice': '5002',
'supplierPriceCurrency': 'PKR',
'type': 'Flight'}],
'orderDate': '11/01/2018 18:51',
'orderRef': 1234,
'orderStatus': 'PayOfflineConfirmedManual',
'source': ' ',
'voucherAmount': 0,
'voucherCode': ' '},
{'affiliate': ' ',
'agent': 'asdss@asda.com',
'customer': {'address': {'address1': ' ',
'address2': ' ',
'city': ' ',
'country': ' ',
'county': ' ',
'postCode': ' '},
'email': 'ads@ads.com',
'mobile': '3332784342',
'name': 'Mr Asdsd Asdsd',
'telephone': '3332324252'},
'items': [{'bookingReference': 'FAILED',
'itemId': 123,
'legs': [{'arrivalDate': '18/01/2018 14:25',
'carrier': 'PA',
'class': 'Economy',
'departureDate': '18/01/2018 11:40',
'destination': 'DXB',
'flightNumber': '210',
'origin': 'KHI'},
{'arrivalDate': '25/01/2018 10:40',
'carrier': 'PA',
'class': 'Economy',
'departureDate': '25/01/2018 6:25',
'destination': 'LHE',
'flightNumber': '211',
'origin': 'DXB'}],
'passengers': [{'dob': '11/08/1991',
'firstName': 'Asd',
'foidType': 'None',
'gender': 'Male',
'passengerType': 'Adult',
'surName': 'Azam',
'title': 'Mr'},
{'dob': '01/07/1974',
'firstName': 'Aziz',
'foidType': 'None',
'gender': 'Male',
'passengerType': 'Adult',
'surName': 'Asdsd',
'title': 'Mr'},
{'dob': '28/05/1995',
'firstName': 'mureed',
'foidType': 'None',
'gender': 'Male',
'passengerType': 'Adult',
'surName': 'ahmed',
'title': 'Mr'},
{'dob': '14/04/2012',
'firstName': 'abdullah',
'foidType': 'None',
'gender': 'Female',
'passengerType': 'Child',
'surName': 'Cdsd',
'title': 'Mr'},
{'dob': '17/12/1999',
'firstName': 'Asdsd',
'foidType': 'None',
'gender': 'Male',
'passengerType': 'Adult',
'surName': 'Ahmed',
'title': 'Mr'}],
'sellingPrice': '154340',
'sellingPriceCurrency': 'PKR',
'supplierCode': 'PITCH',
'supplierName': 'Kicker',
'supplierPrice': '154340',
'supplierPriceCurrency': 'PKR',
'type': 'Flight'}],
'orderDate': '11/01/2018 17:06',
'orderRef': 1235,
'orderStatus': 'PayOfflinePendingManualProcessing',
'source': ' ',
'voucherAmount': 100,
'voucherCode': 'ABC123'},
{'affiliate': ' ',
'agent': 'asda@asdad.com',
'customer': {'address': {'address1': ' ',
'address2': ' ',
'city': ' ',
'country': ' ',
'county': ' ',
'postCode': ' '},
'email': 'asd@asdsd.com',
'mobile': '3067869234',
'name': 'Mr Asds Sdsd',
'telephone': '3067869234'},
'items': [{'bookingReference': ' ',
'itemId': 124,
'legs': [{'arrivalDate': 'ONEWAY',
'carrier': 'PK',
'class': 'Economy',
'departureDate': '23/01/2018 2:00',
'destination': 'SHJ',
'flightNumber': '812',
'origin': 'KHI'}],
'passengers': [{'dob': '01/12/1994',
'firstName': 'Asds',
'foidType': 'Passport',
'gender': 'Male',
'passengerType': 'Adult',
'surName': 'raza',
'title': 'Mr'}],
'sellingPrice': '20134',
'sellingPriceCurrency': 'PKR',
'supplierCode': 'PITCH',
'supplierName': 'Kicker',
'supplierPrice': '20134',
'supplierPriceCurrency': 'PKR',
'type': 'Flight'}],
'orderDate': '11/01/2018 16:23',
'orderRef': 1236,
'orderStatus': 'PayOfflinePendingManualProcessing',
'source': ' ',
'voucherAmount': 0,
'voucherCode': ' '}]