Import from DB-Pedia

DBPedia exports all the entities as GZipped CSV files. Features:

  • First line contains column names, second, third and forth has meta information, which we'll skip (look at "skipFrom": 1, "skipTo": 3in CSV transformer)
  • The vertex class name is created automatically based on the file name, so we can use the same file against any DBPedia file
  • The Primary Key is the "URI" field, where a UNIQUE index has also been created (refer to "ORIENTDB" loader)
  • The "merge" transformer is used to allow to re-import or update any file without generating duplicates

Configuration

{
  "config": {
    "log": "debug",
    "fileDirectory": "/temp/databases/dbpedia_csv/",
    "fileName": "Person.csv.gz"
  },
  "begin": [
   { "let": { "name": "$filePath",  "value": "$fileDirectory.append( $fileName )"} },
   { "let": { "name": "$className", "value": "$fileName.substring( 0, $fileName.indexOf('.') )"} }
  ],
  "source" : {
    "file": { "path": "$filePath", "lock" : true }
  },
  "extractor" : {
   { "csv": { "separator": ",", "nullValue": "NULL", "skipFrom": 1, "skipTo": 3 } },
  },
  "transformers" : [
   { "merge": { "joinFieldName":"URI", "lookup":"V.URI" } },
   { "vertex": { "class": "$className"} }
  ],
  "loader" : {
    "orientdb": {
      "dbURL": "plocal:/temp/databases/dbpedia",
      "dbUser": "admin",
      "dbPassword": "admin",
      "dbAutoCreate": true,
      "tx": false,
      "batchCommit": 1000,
      "dbType": "graph",
      "indexes": [{"class":"V", "fields":["URI:string"], "type":"UNIQUE" }]
    }
  }
}