前序文章讲解了yelp dataset导入Neo4j的详细步骤,但实际操作过程中可能会遇到各种问题。
为了避免中间环节遇到的各类问题,选择直接读取json文件,解析需要的字段导入Neo4j。下文附上详细代码。
注意:
1、字段可能重复,所以代码中设置了独立的set用于去除重复的节点、关系导入;
2、字段值中可能包含特殊字符,比如名字或地址里含有单引号(如 "name":"Marco's Pizza"),若用单引号拼接 Cypher 字符串会导致语句出错,因此导入时需要用双引号表示字符串(更稳妥的做法是使用参数化查询)。
import json
import csv
import py2neo
from py2neo import Graph,Node,Relationship
from base import openfile
import re
import reverse_geocoder as rg
# Connection to the local Neo4j instance (note the non-default HTTP port 11006).
# NOTE(review): credentials are hard-coded here — consider moving them to
# environment variables or a config file before sharing this script.
graph = Graph("http://localhost:11006/", username="admin", password="password")
def read_json(path, filename):
    """Stream one Yelp dataset JSON-lines file into Neo4j, one record per line.

    Args:
        path: directory holding the dataset files (must end with '/').
        filename: 'business.json', 'user.json' or 'review.json'; selects
            the import branch applied to every line.

    Side effects: runs Cypher against the module-level ``graph``. All values
    are passed as Cypher parameters, so quotes inside names/addresses
    (e.g. "Marco's Pizza") can no longer break the generated statements.
    In-memory sets suppress duplicate node/relationship creation per call.
    """
    # One dedup set per node / relationship kind.
    unique_cities = set()
    unique_categorys = set()
    unique_business = set()
    unique_users = set()
    unique_reviews = set()
    unique_business_cities = set()
    unique_business_categorys = set()
    unique_city_state = set()
    unique_state_country = set()

    # 'with' guarantees the file handle is closed (the original leaked it);
    # iterating the handle streams line by line instead of readlines().
    with open(path + filename, 'r', encoding='utf-8') as f:
        for line in f:
            item = json.loads(line)
            if filename == 'business.json':
                _import_business(item, unique_business, unique_categorys,
                                 unique_business_categorys, unique_cities,
                                 unique_city_state, unique_state_country,
                                 unique_business_cities)
            elif filename == 'user.json':
                _import_user(item, unique_users)
            elif filename == 'review.json':
                _import_review(item, unique_reviews)

    # Category nodes and business->category edges are created after the whole
    # file is read, from the accumulated dedup sets.
    for category in unique_categorys:
        try:
            graph.run("merge (c:Category {category: $category})",
                      category=category)
        except Exception as exc:
            print("分类写入报错", exc)

    for business_category in unique_business_categorys:
        business_id, _, category = business_category.partition(',')
        try:
            graph.run(
                "match (b:Business {business_id: $bid}),"
                "(c:Category {category: $category}) "
                "create (b)-[:belong_to]->(c)",
                bid=business_id, category=category)
        except Exception as exc:
            print("business与分类关系写入报错", exc)


def _import_business(item, unique_business, unique_categorys,
                     unique_business_categorys, unique_cities,
                     unique_city_state, unique_state_country,
                     unique_business_cities):
    """Create one Business node plus its City/State/Country hierarchy."""
    business_id = item['business_id']
    name = item['name']
    address = item['address']
    city = item['city']

    if business_id and name and address and business_id not in unique_business:
        try:
            graph.run(
                "merge (b:Business {business_id: $bid, name: $name, "
                "address: $address})",
                bid=business_id, name=name, address=address)
            # 'categories' is one comma-separated string and may be null in
            # the dataset; the original crashed (and silently dropped the
            # whole business) on null.
            categories = item.get("categories")
            if categories:
                for category in categories.split(','):
                    unique_categorys.add(category)
                    unique_business_categorys.add(business_id + "," + category)
            unique_business.add(business_id)
        except Exception as exc:
            print("business写入Neo4j报错", exc)

    # Reverse-geocode the coordinates to find the state/country of the city.
    # Only attach location data for businesses whose node write succeeded.
    if item.get("latitude") and item.get("longitude") \
            and business_id in unique_business:
        try:
            # rg.search returns one result per coordinate pair passed in.
            location = rg.search((item["latitude"], item["longitude"]))[0]
            state = location["admin1"]
            country = location["cc"]
            if city not in unique_cities:
                graph.run("merge (c:City {city: $city})", city=city)
                unique_cities.add(city)
                # merge is idempotent, so no dedup sets are needed for these
                graph.run("merge (s:State {state: $state})", state=state)
                graph.run("merge (c:Country {country: $country})",
                          country=country)
            if city + "," + state not in unique_city_state:
                graph.run(
                    "match (s:State {state: $state}),(c:City {city: $city}) "
                    "create (c)-[:in_state]->(s)",
                    state=state, city=city)
                unique_city_state.add(city + "," + state)
            if state + "," + country not in unique_state_country:
                graph.run(
                    "match (c:Country {country: $country}),"
                    "(s:State {state: $state}) "
                    "create (s)-[:in_country]->(c)",
                    country=country, state=state)
                unique_state_country.add(state + "," + country)
            if business_id + city not in unique_business_cities:
                graph.run(
                    "match (b:Business {business_id: $bid}),"
                    "(c:City {city: $city}) "
                    "create (b)-[:in_city]->(c)",
                    bid=business_id, city=city)
                unique_business_cities.add(business_id + city)
        except Exception as exc:
            print("location 写入报错", exc)


def _import_user(item, unique_users):
    """Create one User node and a friends edge to every listed friend."""
    user_id = item["user_id"]
    # BUG FIX: the original read item["Rashmi"] — a sample value pasted where
    # the 'name' key belongs — so every user line raised KeyError.
    user_name = item["name"]
    if user_id in unique_users:
        return
    try:
        graph.run("merge (u:User {user_id: $uid, name: $name})",
                  uid=user_id, name=user_name)
        unique_users.add(user_id)
        # BUG FIX: the original added user_id to the dedup set inside the
        # friends loop, so only the FIRST friend edge was ever created.
        # .strip() handles the "id1, id2" spacing used by the dataset.
        for friend_id in item["friends"].split(','):
            graph.run(
                "match (b:User {user_id: $uid}),(a:User {user_id: $fid}) "
                "create (b)-[:friends]->(a)",
                uid=user_id, fid=friend_id.strip())
    except Exception as exc:
        print("用户写入报错", exc)


def _import_review(item, unique_reviews):
    """Create one Review node plus its user-write and reviews-business edges."""
    review_id = item["review_id"]
    if review_id in unique_reviews:
        return
    try:
        # str() on stars: the dataset stores it as a number, and the original's
        # string concatenation raised TypeError on every review line. Kept as
        # a string property to match the original's intended quoting.
        graph.run(
            "merge (r:Review {review_id: $rid, stars: $stars, "
            "date: $date, text: $text})",
            rid=review_id, stars=str(item["stars"]),
            date=item["date"], text=item["text"])
        unique_reviews.add(review_id)
        graph.run(
            "match (b:User {user_id: $uid}),(a:Review {review_id: $rid}) "
            "create (b)-[:write]->(a)",
            uid=item["user_id"], rid=review_id)
        # BUG FIX: the original matched on a non-existent 'review' property
        # (nodes are created with 'review_id'), so this edge never matched.
        graph.run(
            "match (b:Business {business_id: $bid}),"
            "(a:Review {review_id: $rid}) "
            "create (a)-[:reviews]->(b)",
            bid=item["business_id"], rid=review_id)
    except Exception as exc:
        print("评论写入报错", exc)
if __name__ == '__main__':
    # Import the three dataset files in order; businesses go in first so the
    # review edges have Business nodes to match against.
    # (Removed the unused `file = path + filename` local from the original.)
    path = 'D:/share/yelp/'
    for filename in ('business.json', 'review.json', 'user.json'):
        read_json(path, filename)