Source code for mast_contributor_tools.filename_check.fc_db

"""Create and manage an SQLite database for storing results of file checking."""

import sqlite3

import astropy.io.fits as fits
import pandas as pd
from astropy.table import Table

# The following SQL will create am SQLite database
FILENAME_TABLE = """
        CREATE TABLE IF NOT EXISTS filename (
	    path	TEXT NOT NULL DEFAULT '.',
	    filename	TEXT NOT NULL UNIQUE,
	    final_verdict	TEXT CHECK("final_verdict" IN ('PASS', 'FAIL', 'NEEDS REVIEW')),
	    n_elements	INTEGER
        );
        """
FIELDS_TABLE = """
        CREATE TABLE IF NOT EXISTS fields (
        file_ref  TEXT NOT NULL,
	    name  TEXT NOT NULL,
        value TEXT NOT NULL,
        nfield INTEGER,
	    capitalization_score  TEXT NOT NULL DEFAULT 'fail' CHECK("capitalization_score" IN ('pass', 'fail')),
	    length_score  TEXT NOT NULL DEFAULT 'fail' CHECK("length_score" IN ('pass', 'fail')),
	    format_score  TEXT NOT NULL DEFAULT 'fail' CHECK("length_score" IN ('pass', 'fail')),
	    value_score  TEXT NOT NULL DEFAULT 'fail' CHECK("value_score" IN ('pass', 'fail', 'needs review')),
        nfield_score  TEXT NOT NULL DEFAULT 'fail' CHECK("value_score" IN ('pass', 'fail', 'needs review')),
	    field_verdict  TEXT NOT NULL DEFAULT 'FAIL' CHECK("field_verdict" IN ('PASS', 'FAIL', 'NEEDS REVIEW')),
	    FOREIGN KEY(file_ref) REFERENCES filename_db(filename)
        );
        """
PROBLEMS_VIEW = """
        CREATE VIEW IF NOT EXISTS potential_problems as
        select fn.path, fn.filename, fn.n_elements, fl.name, fl.value, fl.nfield, fl.capitalization_score, fl.length_score,
        fl.format_score, fl.value_score, fl.nfield_score, fl.field_verdict
        from filename as fn, fields as fl
        where fn.filename = fl.file_ref
        AND fl.field_verdict != 'PASS';
        """

INSERT_FILE_RECORD = """INSERT INTO filename VALUES(:path,:filename,:final_verdict,:n_elements)"""
INSERT_FIELD_RECORD = """INSERT INTO fields VALUES(:file_ref,:name,:value,:nfield,:capitalization_score,:length_score,:format_score,:value_score,:nfield_score,:field_verdict)"""


[docs] class Hlsp_SQLiteDb: """Create an SQLite DB to store results. Parameters ---------- filename : str name of the SQLite DB file to be created """ def __init__( self, filename: str, ) -> None: self.db_file = filename
[docs] def create_db(self) -> None: """Create the database and construct the tables. Raises ------ sqlite3.Error : Error Raised if the DB cannot be created or the tables fail to be created. """ try: self.conn = sqlite3.connect(self.db_file) for statement in [FILENAME_TABLE, FIELDS_TABLE, PROBLEMS_VIEW]: self.conn.execute(statement) # Turn on Write-Ahead Log # See https://www.powersync.com/blog/sqlite-optimizations-for-ultra-high-performance self.conn.execute("PRAGMA journal_mode = WAL") self.conn.execute("PRAGMA synchronous = normal") self.conn.execute("PRAGMA journal_size_limit = 6144000") self.conn.commit() except sqlite3.Error as e: print(e)
[docs] def close_db(self) -> None: self.conn.close()
[docs] def add_filename(self, file_record: dict) -> None: """Add file metadata to the filename table Add file metadata to the DB in the form of a list of key:value pairs. Parameters ---------- file_record : dict File attributes """ self.conn.execute(INSERT_FILE_RECORD, file_record) self.conn.commit()
[docs] def add_fields(self, elements: list[dict]) -> None: """Add metadata for each of a filename's fields to the fields table Add results of checking mmultiple filename fields to the fields table. Metadata for each field take the form of key:value pairs. Parameters ---------- elements : list[dict] List of element attribute dictionaries. """ self.conn.executemany(INSERT_FIELD_RECORD, elements) self.conn.commit()
[docs] def print_summary(self) -> str: """ Returns a string detailing some summary information on how many files have passed validation """ # Get data from db dat = self.conn.execute("SELECT filename, final_verdict from filename").fetchall() # Parse numbers num_files = len(dat) num_pass = sum([1 for d in dat if d[1].upper() == "PASS"]) num_review = sum([1 for d in dat if d[1].upper() == "NEEDS REVIEW"]) num_fail = sum([1 for d in dat if d[1].upper() == "FAIL"]) # Write summary message summary_message = "Output summary:\n " summary_message += f"Files Checked: {num_files}\n " summary_message += f"Files Passed: {num_pass}\n " summary_message += f"Files Need Review: {num_review}\n " summary_message += f"Files Failed: {num_fail}\n " # All files passed if num_pass == num_files: summary_message += "All files passed!" elif num_review > 0: summary_message += "If any fields were marked with a score of 'needs review', please consult with your MAST staff contact. Unrecognized values are very often necessary and good, but require review." # Some files failed elif num_fail > 0: summary_message += f"See results file ({self.db_file}) for more information. Some files did not meet our criteria. Note: only fields with a final_verdict of 'fail' contributed to this result." # Add more detail here later? - could break down by fields, etc. return summary_message
[docs] def write_to_alternate_format(self, save_format: str) -> list[str]: """ Write out the SQLite DB as an alternate format. Parameters ---------- save_format : str Format to save output: 'csv', 'excel', 'html', or 'fits' Returns -------- files_written: list[str]: List of file names written out """ # Check that input is a valid option supported_formats = ["csv", "excel", "fits", "html"] if save_format.lower() not in supported_formats: msg = f"Save Format '{save_format}' is not supported." msg += f"Please choose from {supported_formats}" raise ValueError(msg) # Construct new filename to save to fileroot = self.db_file.strip(".db") # Read DB file as pandas dataframe self.conn = sqlite3.connect(self.db_file) filename_data = pd.read_sql_query("SELECT * FROM filename", self.conn) fields_data = pd.read_sql_query("SELECT * FROM fields", self.conn) # Close connection self.conn.close() # Save out data in new format # CSV files if save_format == "csv": ouput_filename1 = f"{fileroot}_filenames.csv" ouput_filename2 = f"{fileroot}_fields.csv" filename_data.to_csv(ouput_filename1) fields_data.to_csv(ouput_filename2) files_written = [ouput_filename1, ouput_filename2] # Excel spreadsheet elif save_format == "excel": output_filename = f"{fileroot}.xlsx" # Style dataframe: change color of cell depending on verdict filename_data = filename_data.style.map(color_formatter) fields_data = fields_data.style.map(color_formatter) # Save as one Excel document with two sheets: # One for filenames table and one for fields with pd.ExcelWriter(output_filename) as excel_writer: filename_data.to_excel(excel_writer, sheet_name="FileNames") fields_data.to_excel(excel_writer, sheet_name="Fields") files_written = [output_filename] # Fits table elif save_format == "fits": # Save as one fits file with two table extensions: # One for filenames table and one for fields output_filename = f"{fileroot}.fits" hdu_list = fits.HDUList( [ fits.PrimaryHDU(), # TODO: add some metadata? fits.table_to_hdu(Table.from_pandas(filename_data), name="FILENAMES"), fits.table_to_hdu(Table.from_pandas(fields_data), name="FIELDS"), ] ) hdu_list.writeto(output_filename, overwrite=True) files_written = [output_filename] # Html table elif save_format == "html": ouput_filename1 = f"{fileroot}_filenames.html" ouput_filename2 = f"{fileroot}_fields.html" # Style dataframe: change color of cell depending on verdict filename_data = filename_data.style.map(color_formatter) fields_data = fields_data.style.map(color_formatter) # Then write to html file filename_data.to_html( ouput_filename1, header=True, ) fields_data.to_html( ouput_filename2, header=True, ) files_written = [ouput_filename1, ouput_filename2] else: # No alternate format - only DB file files_written = [self.db_file] return files_written
[docs] def color_formatter(value: str) -> str: """Color mapping for use in write_to_alternate_format(). Color-codes table cells based on verdict: green for "PASS", red for "FAIL", etc. Parameters ------ value: str """ if str(value).upper() == "PASS": color = "lightgreen" elif str(value).upper() == "FAIL": color = "red" elif str(value).upper() == "NEEDS REVIEW": color = "yellow" else: color = None return "background-color: %s" % color