Source code for src.orca_out_to_json

#!/usr/bin/env python3
"""
A script to quickly pull desired data from an ORCA .out file and compile
it into a JSON file.

Before running, the user should specify what information they want to look for
in a .txt file (see example). When executed, the script checks each file in the
working directory. If the file ends in .out, it exports the desired data into
a JSON file.
"""
__author__ = "Peter Waddell"
__copyright__ = "Copyright 2024"
__credits__ = ["Peter Waddell"]
__version__ = "0.1.0"
__date__ = "2024/03/01"
__maintainer__ = "Peter Waddell"
__email__ = "pmwaddell9@gmail.com"
__status__ = "Prototype"

import os
import json
import sys

from orca_data_extraction.src.structure_data_builder import StructureDataBuilder



[docs]
def make_json_list(sd_list):
    """
    Converts the data in a list of StructureData instances to a list for JSON.

    Each StructureData instance becomes one dict holding the two filenames
    plus one nested dict per data section, keyed by the section name
    (spaces replaced by underscores, lowercased).

    Parameters
    ----------
    sd_list : list
        List containing the set of StructureData instances that each come
        from the ORCA .out files.

    Returns
    -------
    list
        List of dicts with the data from sd_list configured to be compatible
        with JSON (i.e., tuples are converted to strings). Every string key
        and string value (including the converted tuples) is additionally
        normalized by ``format_column_name``.
    """
    def format_column_name(x):
        """
        Format string for use as a column name in the JSON file.

        Parameters
        ----------
        x : str or other
            Entity to be potentially renamed.

        Returns
        -------
        str or other
            The string with single quotes removed, ', ' collapsed to ',',
            remaining spaces replaced by '_' and lowercased; or the object
            unchanged if it is not a string (e.g., a dict).
        """
        if not isinstance(x, str):
            return x
        return (
            x.replace("'", "")
            .replace(', ', ',')
            .replace(' ', '_')
            .lower()
        )

    def stringify_tuple(obj):
        """
        Return ``str(obj)`` for tuples, otherwise return obj unchanged.

        isinstance (rather than an exact type comparison) is used so that
        tuple subclasses such as namedtuples are converted too; as dict keys
        they would otherwise make json.dump raise a TypeError.
        """
        return str(obj) if isinstance(obj, tuple) else obj

    json_lst = []
    for sd in sd_list:
        sd_data = {
            'script_input_filename': sd.get_input_filename(),
            'orca_out_filename': sd.get_out_filename()
        }
        for data_section in sd.get_data_sections().values():
            # JSON is not compatible with tuples, so must convert to str
            json_safe_data = {
                format_column_name(stringify_tuple(key)):
                    format_column_name(stringify_tuple(val))
                for key, val in data_section.get_data().items()
            }
            section_key = \
                data_section.get_section_name().replace(' ', '_').lower()
            sd_data[section_key] = json_safe_data
        json_lst.append(sd_data)
    return json_lst




[docs]
def create_json_from_sds(sd_list, json_name):
    """
    Serializes a list of StructureData instances into a JSON file on disk.

    The instances are first converted with ``make_json_list`` and the result
    is written, indented by two spaces, to ``<json_name>.json``.

    Parameters
    ----------
    sd_list : list
        The StructureData instances (one per ORCA .out file) to be exported.
    json_name : str
        Name of the output file without its extension; ".json" is appended.
    """
    output_path = f'{json_name}.json'
    with open(output_path, 'w') as json_file:
        json.dump(make_json_list(sd_list), json_file, indent=2)



[docs]
def main():
    """
    Run the script: collect data from every .out file in the working
    directory and save it to a JSON file.

    Command line usage is ``<script> [inputs_file [json_name]]``. Any value
    that is not given on the command line is asked for interactively. The
    inputs file is handed to StructureDataBuilder, which is then used to
    build one StructureData instance per .out file found in the current
    working directory. Files whose processing raises an IndexError are
    reported and skipped.
    """
    # TODO: use an argument parser here instead? make argument inputs more sophisticated?
    json_name = ''
    # Process command line arguments
    # TODO: extract this part for each file type?? let the user select the file type @ command line?
    if len(sys.argv) >= 2:
        inputs_name = sys.argv[1]
        if not os.path.isfile(inputs_name):
            print('No file with name ' + inputs_name + ' found.')
            # sys.exit rather than quit(): quit is only injected by the
            # site module and is not guaranteed to exist.
            sys.exit()
        if len(sys.argv) >= 3:
            json_name = sys.argv[2]
    else:
        print('Script will execute on all .out files in the current '
              'working directory.')
        while True:
            print('Name of input file with atom labels ("q" to quit): ',
                  end='')
            inputs_name = input()
            if inputs_name == 'q':
                sys.exit()
            # Relative names resolve against the working directory; no
            # hard-coded path separator, so this works on every platform.
            if not os.path.isfile(inputs_name):
                print('No file with name ' + inputs_name + ' found.')
                continue
            break

    # Ask for JSON file name
    if json_name == '':
        print('Name of the JSON file which will contain the data (press ENTER '
              'to use the default name, "q" to quit): ', end='')
        json_name = input()
        if json_name == 'q':
            sys.exit()
        # If the user just hits enter, use default name:
        if json_name == '':
            # Strip directories and whatever extension is present (if any)
            # instead of blindly cutting the last four characters.
            inputs_stem = os.path.splitext(os.path.basename(inputs_name))[0]
            json_name = f'ORCA_data_{inputs_stem}'

    print('')
    sd_list = []
    structure_data_builder = StructureDataBuilder(inputs_name)
    # Sorted so the order of entries in the JSON file is reproducible;
    # os.listdir returns names in arbitrary order.
    for f in sorted(os.listdir(os.getcwd())):
        if not (os.path.isfile(f) and f.endswith('.out')):
            continue
        print(f'Beginning search: {f}')
        try:
            structure_data = structure_data_builder.build(f)
        except IndexError:
            # Best effort: report the problematic file and carry on.
            print(f'Something went wrong with {f} and it threw '
                  f'an IndexError...\n')
        else:
            sd_list.append(structure_data)
            print(f'{f} complete.\n')

    create_json_from_sds(sd_list, json_name)
    print(f'Process complete! Results saved as "{json_name}.json"')



# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
Source code for src.orca_out_to_json

orca-data-extraction

Navigation

Related Topics