Source code for src.mulliken_charges

#!/usr/bin/env python3
"""
The MullikenCharges class searches for and stores Mulliken charge data from
an ORCA .out file.
"""
__author__ = "Peter Waddell"
__copyright__ = "Copyright 2024"
__credits__ = ["Peter Waddell"]
__version__ = "0.1.0"
__date__ = "2024/02/27"
__maintainer__ = "Peter Waddell"
__email__ = "pmwaddell9@gmail.com"
__status__ = "Prototype"

import re

from orca_data_extraction.src.data_section_with_inputs import DataSectionWithInputs



[docs]
class MullikenCharges(DataSectionWithInputs):
    """
    Finds and stores Mulliken charge data from an ORCA .out file.

    Methods
    -------
    _search
        Search .out file for Mulliken charge data.
    """
    def __init__(self, out_filename, outfile_contents, inputs):
        """
        Parameters
        ----------
        out_filename : str
            Name of the ORCA .out file that will be searched.
        outfile_contents : str
            String containing the full text of the ORCA .out file.
        inputs : tuple
            Tuple of atom labels (e.g. ('2 H') for which Mulliken charge data
            will be searched.
        """
        super().__init__(out_filename, outfile_contents, inputs)
        self._section_name = 'Mulliken Charges'

    def _search(self, atom_label):
        """
        Use regex to search .out file for an atom's Mulliken charge data.

        Parameters
        ----------
        atom_label : str
            String of the desired atom label.

        Returns
        -------
        str
            A string of a number corresponding to the Mulliken charge
            of the atom.

        Raises
        ------
        AttributeError
            This occurs when the regex fails to find what it is looking
            for, and returns NoneType. Then, .group(n) gives this error.
        """
        def __convert_str_for_verbose_regex(s):
            """
            Converts string to a form that works properly for verbose REs.

            Verbose regular expressions ignore whitespace, unless preceded by a
            "\" (backslash) character. To use such a string as part of a
            verbose RE, this character must be added before each space first;
            this function returns a version of the input string modified
            in this way.

            Parameters
            ----------
            s: str
                Input string.

            Returns
            -------
            result: str
                A modified version of the input string which now has "\"
                preceding each whitespace character.
            """
            result = ''
            for i in range(len(s)):
                if s[i] == ' ':
                    result = result + '\\ '
                else:
                    result = result + s[i]
            return result

        def __reverse_string_by_lines(s):
            """
            Reverses a string in terms of the order of its lines.

            Parameters
            ----------
            s: str
                String to be reversed line-by-line

            Returns
            -------
            str
                Reversed string, line-by-line.
            """
            s = s.splitlines()[::-1]
            return '\n'.join(s)

        re_atom_label = __convert_str_for_verbose_regex(atom_label)

        # Here I am assuming that the last occurrence of geometry data in the
        # .out file will represent the finished calculation, seems logical...

        # However, I found that in order to find the last occurrence without
        # catastrophic backtracking, it was necessary to reverse the line order
        # of the outfile contents string, then match the first occurrence there.
        # TODO: consider moving these into the constructor, as an attribute?
        # TODO: but given the current structure this appears to be annoying...
        reversed_contents = \
            __reverse_string_by_lines(self._outfile_contents)

        # Note that, consequently, this regex string is INVERTED to match!!!
        # Note also: \ must be used for all whitespace I want to count
        # when using verbose regular expressions.
        mulliken_charge_regex = re.compile(
            fr"""
            (MULLIKEN\ REDUCED\ ORBITAL\ CHARGES)
            (.*?)
            ((\ |\n){re_atom_label}\ :)
            (\ *)               # whitespace
            (-?[\d]+[.][\d]+)   # Mulliken charge
            (.*?)
            (MULLIKEN\ ATOMIC\ CHARGES) 
            # above: prevents accidental matches later in the .out file
            """
            , flags=re.VERBOSE | re.DOTALL)
        try:
            result = mulliken_charge_regex.search(reversed_contents)
            return result.group(6)
        except AttributeError:
            print(f'Error: {atom_label} was not found '
                  f'in {self._out_filename} (Mulliken Charges).')
            return None
Source code for src.mulliken_charges

orca-data-extraction

Navigation

Related Topics