A custom Python tool for building ndjson-content packages

A Python tool for generating ndjson from formatted SQL files

- 2 mins read

Series: Python

Description

The artifact_builder is a tool that converts SQL files into a single packaged ndjson file. It works as a submodule in the target repositories (e.g. pgqueries) and generates a package of the SQL files to be indexed and stored in a database.

This project has a very specific domain and is not intended for general-purpose use. The ndjson format was chosen to provide a widely supported format, so the results can be imported into any kind of data store or search endpoint.

This is a very generic example of how the package can be parsed and stored:

package storage

// Initial work on the indexer

import (
	"bytes"
	"encoding/json"
	"fmt"

	"internal/types"
	"internal/utils"

	_ "github.com/mattn/go-sqlite3"
)

// Repos pairs an engine name with a URL.
//
// NOTE(review): the index method in this file builds types.Repos values
// (which also carry a Version) rather than this type; confirm whether this
// local declaration is still needed.
type Repos struct {
	Engine string // engine name; presumably the database engine the queries target — TODO confirm
	URL    string // presumably the location of that engine's JSON package — TODO confirm
}

// index fetches the JSON package of every repository listed in cfg.Repos,
// decodes it and hands each query it contains to insertRowOnConflict.
//
// Each package is decoded as a single JSON object that maps a query name to
// its artifact row. Processing stops at the first repository whose package
// cannot be fetched or decoded; that error is returned wrapped together with
// the offending URL. Rows already passed to insertRowOnConflict before the
// failure are not undone by this function.
func (s *Storage) index(cfg *types.ConfigFile) error {
	for engine, repo := range cfg.Repos {
		jsonData, err := utils.FetchJSON(repo.URL)
		if err != nil {
			return fmt.Errorf("error fetching JSON from %s: %w", repo.URL, err)
		}
		// Strip a leading UTF-8 byte order mark, if present, before decoding.
		jsonData = bytes.TrimPrefix(jsonData, []byte("\xef\xbb\xbf"))

		artifacts := make(map[string]jsonArtifactRow)
		if err := json.Unmarshal(jsonData, &artifacts); err != nil {
			return fmt.Errorf("error parsing JSON from %s: %w", repo.URL, err)
		}

		for queryTitle, artifact := range artifacts {
			row := &QueriesRow{
				// The ID is a hash of the fields that identify a query within
				// an engine, so the same query hashes to the same ID on every run.
				Id:       utils.GetMD5Hash(engine + artifact.Title + artifact.FPath + artifact.Category),
				Engine:   engine,
				Name:     queryTitle,
				Title:    artifact.Title,
				Doc:      artifact.Doc,
				DocPath:  artifact.DocPath,
				FPath:    artifact.FPath,
				Category: artifact.Category,
				Query:    artifact.Query,
			}
			// The insert relies on an ON CONFLICT clause: not the most elegant
			// approach, but practical for this use case.
			// NOTE(review): any result of insertRowOnConflict is not inspected
			// here; if it returns an error, it should be checked — confirm.
			s.insertRowOnConflict(row)
		}
	}

	return nil
}

comments powered by Disqus