Source code for emle._orca_parser

######################################################################
# EMLE-Engine: https://github.com/chemle/emle-engine
#
# Copyright: 2023-2025
#
# Authors: Lester Hedges   <lester.hedges@gmail.com>
#          Kirill Zinovjev <kzinovjev@gmail.com>
#
# EMLE-Engine is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 2 of the License, or
# (at your option) any later version.
#
# EMLE-Engine is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with EMLE-Engine If not, see <http://www.gnu.org/licenses/>.
######################################################################

"""
Parser for ORCA and HORTON outputs.
"""

__all__ = ["ORCAParser"]

import ase as _ase
import h5py as _h5py
import os as _os
import numpy as _np
import tarfile as _tarfile

from ._utils import pad_to_max

_HARTREE_TO_KCALMOL = 627.509


[docs] class ORCAParser: """ Parses ORCA gas phase calculations and corresponding HORTON outputs. Optionally, extract molecular dipolar polarizability (needed for EMLE training) and does energy decomposition analysis for embedding energy. The latter requires additional calculation in presence of point charges and electrostatic potential of the gas phase system at the positions of the point charges). """ _HORTON_KEYS = ( "cartesian_multipoles", "core_charges", "valence_charges", "valence_widths", ) def __init__(self, filename, decompose=False, alpha=False): """ Constructor. Parameters ---------- filename : str Tarball with ORCA and HORTON outputs: ".h5": HORTON output. ".vac.orca": ORCA output for gas phase calculation. If alpha=True, must contain molecular dipolar polarizability tensor as well. The following files are required if decompose=True: ".pc.orca": ORCA output for calculation with point charges (PC). ".pc": Charges and positions of the point charges (the ones used for .pc.orca calculation). ".vpot": Output of orca_vpot, electrostatic potential of gas phase system at the positions of the point charges. ".h5": HORTON output. ".vac.orca": ORCA output for gas phase calculation. If alpha=True, must contain molecular dipolar polarizability tensor as well. All files must have numeric names (same number for the same structure). decompose: bool Whether to do energy decomposition analysis alpha: bool Whether to extract molecular dipolar polarizability tensor Notes ----- Once the files are parsed the following attributes are available: z: (N_MOLS, MAX_N_ATOMS) atomic indices. xyz: (N_MOLS, MAX_N_ATOMS, 3) positions of atoms mbis: a dictionary with MBIS properties (s, q_core, q_val, mu) alpha: (N_MOLS, 3, 3) array of molecular dipolar polarizability tensors (alpha=True) E: total embedding energies (decompose=True) E_static: static embedding energies (decompose=True) E_induced: induced embedding energies (decompose=True) """ if not isinstance(filename, str): raise ValueError("filename must be a string") if not _os.path.exists(filename): raise FileNotFoundError(f"{filename} does not exist") if not isinstance(decompose, bool): raise ValueError("decompose must be a boolean") if not isinstance(alpha, bool): raise ValueError("alpha must be a boolean") try: with _tarfile.open(filename, "r") as tar: self._tar = tar self.names = self._get_names(tar) self.mbis = self._parse_horton() self.z, self.xyz = self._get_z_xyz() if decompose: self.vac_E, self.pc_E = self._get_E() self.E = self.pc_E - self.vac_E self.E_static = self._get_E_static() self.E_induced = self.E - self.E_static if alpha: self.alpha = self._get_alpha() except Exception as e: raise ValueError(f"Failed to parse {filename}: {e}") del self._tar @staticmethod def _get_names(tar): return sorted( [int(name.split(".")[0]) for name in tar.getnames() if name.endswith("h5")] ) def _get_E(self): vac_E = [ self._get_E_from_out(self._get_file(name, "vac.orca")) for name in self.names ] pc_E = [ self._get_E_from_out(self._get_file(name, "pc.orca")) for name in self.names ] return _np.array(vac_E), _np.array(pc_E) def _get_E_from_out(self, f): E_prefix = b"FINAL SINGLE POINT ENERGY" E_line = next(line for line in f if line.startswith(E_prefix)) return float(E_line.split()[-1]) * _HARTREE_TO_KCALMOL def _get_E_static(self): vpot_all = self._get_vpot() pc_all = self._get_pc() result = _np.array([(vpot @ pc) for vpot, pc in zip(vpot_all, pc_all)]) return result * _HARTREE_TO_KCALMOL def _get_vpot(self): return [ self._get_vpot_from_file(self._get_file(name, "vpot")) for name in self.names ] @staticmethod def _get_vpot_from_file(f): return _np.loadtxt(f, skiprows=1)[:, 3] def _get_pc(self): return [ self._get_pc_from_file(self._get_file(name, "pc")) for name in self.names ] @staticmethod def _get_pc_from_file(f): return _np.loadtxt(f, skiprows=1)[:, 0] def _get_alpha(self): alpha = [ self._get_alpha_from_out(self._get_file(name, "vac.orca")) for name in self.names ] return _np.array(alpha) @staticmethod def _get_alpha_from_out(f): while next(f) != b"THE POLARIZABILITY TENSOR\n": pass for i in range(3): next(f) return _np.array([list(map(float, next(f).split())) for _ in range(3)]) def _get_z_xyz(self): mol_data = [ self._get_z_xyz_from_out(self._get_file(name, "vac.orca")) for name in self.names ] z, xyz = zip(*mol_data) return pad_to_max(z, 0), pad_to_max(xyz) @staticmethod def _get_z_xyz_from_out(f): while next(f) != b"CARTESIAN COORDINATES (ANGSTROEM)\n": pass next(f) z, xyz = [], [] try: while True: raw_atom_z, *raw_atom_xyz = next(f).split() z.append(raw_atom_z.decode()) xyz.append([float(x) for x in raw_atom_xyz]) except ValueError: pass return _np.array(_ase.symbols.symbols2numbers(z)), _np.array(xyz) def _parse_horton(self): data = [ self._parse_horton_out(self._get_file(name, "h5")) for name in self.names ] if len(data) == 0: raise ValueError return {k: pad_to_max([_[k] for _ in data]) for k in data[0].keys()} def _parse_horton_out(self, f): h5f = _h5py.File(f) data = {key: h5f[key][:] for key in self._HORTON_KEYS} q = data["core_charges"] + data["valence_charges"] q_total = _np.sum(q) q_shift = (_np.round(q_total) - q_total) / len(q) return { "s": data["valence_widths"], "q_core": data["core_charges"], "q_val": data["valence_charges"] + q_shift, "mu": data["cartesian_multipoles"][:, 1:4], } def _get_file(self, name, suffix): return self._tar.extractfile(f"{name}.{suffix}")