"""
Helper functions for parsing defect supercell calculations.
"""
import contextlib
import itertools
import os
import re
import warnings
from collections.abc import Iterable
from copy import deepcopy
from functools import lru_cache, partialmethod
from pathlib import Path
from typing import Any, cast
from xml.etree.ElementTree import Element as XML_Element
import numpy as np
import pandas as pd
from monty.io import reverse_readfile
from monty.serialization import loadfn
from numpy.typing import ArrayLike
from pymatgen.analysis.defects.core import DefectType
from pymatgen.core.periodic_table import Element
from pymatgen.core.structure import Composition, Lattice, PeriodicSite, SiteCollection, Structure
from pymatgen.core.structure_matcher import get_linear_assignment_solution, pbc_shortest_vectors
from pymatgen.electronic_structure.core import Spin
from pymatgen.io.vasp.inputs import POTCAR_STATS_PATH, UnknownPotcarWarning
from pymatgen.io.vasp.outputs import Locpot, Outcar, Procar, Vasprun, _parse_vasp_array
from pymatgen.util.coord import all_distances
from pymatgen.util.typing import PathLike, SpeciesLike
from doped.core import DefectEntry, remove_site_oxi_state
from doped.utils import _warn_parameter_order
@lru_cache(maxsize=1000) # cache POTCAR generation to speed up generation and writing
def _get_potcar_summary_stats() -> dict:
return loadfn(POTCAR_STATS_PATH)
[docs]
def find_archived_fname(fname, raise_error=True):
"""
Find a suitable filename, taking account of possible use of compression
software.
"""
if os.path.exists(fname):
return fname
# Check for archive files
for ext in [".gz", ".xz", ".bz", ".lzma"]:
if os.path.exists(fname + ext):
return fname + ext
if raise_error:
raise FileNotFoundError
return None
# has to be defined as staticmethod to be consistent with usage in pymatgen, alternatively could make
# fake custom class:
[docs]
@staticmethod # type: ignore[misc]
def parse_projected_eigen(
elem: XML_Element, parse_mag: bool = True
) -> tuple[dict[Spin, np.ndarray], np.ndarray | None]:
"""
Parse the projected eigenvalues from a |Vasprun| object (used during
initialisation), but excluding the projected magnetization for efficiency.
Note that following SK's PRs to ``pymatgen`` (#4359, #4360), parsing of
projected eigenvalues adds minimal additional cost to |Vasprun| parsing
(~1-5%), while parsing of projected magnetization can add ~30% cost.
This is a modified version of ``_parse_projected_eigen`` from
|Vasprun|, which allows skipping of projected magnetization parsing in
order to expedite parsing in ``doped``, as well as some small adjustments
to maximise efficiency.
Args:
elem (Element):
The XML element to parse, with projected eigenvalues/magnetization.
parse_mag (bool):
Whether to parse the projected magnetization. Default is ``True``.
Returns:
tuple[dict[Spin, np.ndarray], np.ndarray | None]:
A dictionary of projected eigenvalues for each spin channel
(up/down), and the projected magnetization (if parsed).
"""
root = elem.find("array/set")
assert root is not None # projected eigenvalue array always present when this is called
proj_eigen = {}
sets = root.findall("set")
for s in sets:
spin_match = re.match(r"spin(\d+)", s.attrib["comment"])
assert spin_match is not None
spin = int(spin_match[1])
if spin == 1 or (spin == 2 and len(sets) == 2):
spin_key = Spin.up if spin == 1 else Spin.down
elif parse_mag: # parse projected magnetization
spin_key = spin # {2:"x", 3:"y", 4:"z"}
else:
continue
proj_eigen[spin_key] = np.array(
[[_parse_vasp_array(sss) for sss in ss.findall("set")] for ss in s.findall("set")]
)
if len(proj_eigen) > 2:
# non-collinear magnetism (spin-orbit coupling) enabled, last three "spin channels" are the
# projected magnetization of the orbitals in the x, y, and z Cartesian coordinates:
proj_mag = np.stack([proj_eigen.pop(i) for i in range(2, 5)], axis=-1)
proj_eigen = {Spin.up: proj_eigen[Spin.up]}
else:
proj_mag = None
# here we _could_ round to 3 decimal places (and ensure rounding 0.0005 up to 0.001) to be _mostly_
# consistent with PROCAR values (still not 100% the same as e.g. 0.00047 will be rounded to 0.0005
# in vasprun, but 0.000 in PROCAR), but this is _reducing_ the accuracy so better not to do this,
# and accept that PROCAR results may not be as numerically robust
# proj_eigen = {k: np.round(v+0.00001, 3) for k, v in proj_eigen.items()}
elem.clear()
return proj_eigen, proj_mag
[docs]
def get_vasprun(vasprun_path: PathLike, parse_mag: bool = True, **kwargs):
"""
Read the ``vasprun.xml(.gz)`` file as a ``pymatgen`` |Vasprun| object.
"""
vasprun_path = str(vasprun_path) # convert to string if Path object
warnings.filterwarnings(
"ignore", category=UnknownPotcarWarning
) # Ignore unknown POTCAR warnings when loading vasprun.xml
# pymatgen assumes the default PBE with no way of changing this within get_vasprun())
warnings.filterwarnings(
"ignore", message="No POTCAR file with matching TITEL fields"
) # `message` only needs to match start of message
default_kwargs = {"parse_dos": False, "exception_on_bad_xml": False}
default_kwargs.update(kwargs)
Vasprun._parse_projected_eigen = partialmethod(parse_projected_eigen, parse_mag=parse_mag)
try:
with warnings.catch_warnings(record=True) as w:
vasprun = Vasprun(find_archived_fname(vasprun_path), **default_kwargs)
for warning in w:
if "XML is malformed" in str(warning.message):
warnings.warn(
f"vasprun.xml file at {vasprun_path} is corrupted/incomplete. Attempting to "
f"continue parsing but may fail!"
)
else: # show warning, preserving original category:
warnings.warn(warning.message, category=warning.category)
except FileNotFoundError as exc:
raise FileNotFoundError(
f"vasprun.xml not found at {vasprun_path}(.gz/.xz/.bz/.lzma). Needed for parsing calculation "
f"output!"
) from exc
return vasprun
[docs]
def get_locpot(locpot_path: PathLike):
"""
Read the ``LOCPOT(.gz)`` file as a ``pymatgen`` ``Locpot`` object.
"""
locpot_path = str(locpot_path) # convert to string if Path object
try:
locpot = Locpot.from_file(find_archived_fname(locpot_path))
except FileNotFoundError:
raise FileNotFoundError(
f"LOCPOT file not found at {locpot_path}(.gz/.xz/.bz/.lzma). Needed for calculating the "
f"Freysoldt (FNV) image charge correction!"
) from None
return locpot
def _get_outcar_path(outcar_path: PathLike, raise_error=True):
outcar_path = str(outcar_path) # convert to string if Path object
try:
return find_archived_fname(outcar_path)
except FileNotFoundError:
if raise_error:
raise FileNotFoundError(
f"OUTCAR file not found at {outcar_path}(.gz/.xz/.bz/.lzma). Needed for calculating the "
f"Kumagai (eFNV) image charge correction."
) from None
[docs]
def get_outcar(outcar_path: PathLike):
"""
Read the ``OUTCAR(.gz)`` file as a ``pymatgen`` |Outcar| object.
"""
outcar_path = _get_outcar_path(outcar_path)
return Outcar(outcar_path)
[docs]
def get_core_potentials_from_outcar(
outcar_path: PathLike, dir_type: str = "", total_energy: list | float | None = None
):
"""
Get the core potentials from the ``OUTCAR`` file, which are needed for the
Kumagai-Oba (eFNV) finite-size correction.
This parser skips the full ``pymatgen`` |Outcar| initialisation/parsing,
to expedite parsing and make it more robust (doesn't fail if ``OUTCAR`` is
incomplete, as long as it has the core potentials information).
Args:
outcar_path (PathLike):
The path to the ``OUTCAR`` file.
dir_type (str):
The type of directory the ``OUTCAR`` is in (e.g. ``bulk`` or
``defect``) for informative error messages.
total_energy (list | float | None):
The already-parsed total energy for the structure. If provided,
will check that the total energy of the ``OUTCAR`` matches this
value / one of these values, and throw a warning if not.
Returns:
np.ndarray:
The core potentials from the last ionic step in the ``OUTCAR``.
"""
# initialise Outcar class without running __init__ method:
outcar = Outcar.__new__(Outcar)
outcar.filename = _get_outcar_path(outcar_path)
core_pots_list = outcar.read_avg_core_poten()
if not core_pots_list:
_raise_incomplete_outcar_error(outcar_path, dir_type=dir_type)
_check_outcar_energy(outcar_path, total_energy=total_energy)
return -1 * np.array(core_pots_list[-1]) # core potentials from last step
def _get_final_energy_from_outcar(outcar_path):
"""
Get the final total energy from an ``OUTCAR`` file, even if the calculation
was not completed.
Templated on the ``OUTCAR`` parsing code from ``pymatgen``, but works even
if the ``OUTCAR`` is incomplete.
"""
e0_pattern = re.compile(r"energy\(sigma->0\)\s*=\s+([\d\-\.]+)")
e0 = None
for line in reverse_readfile(outcar_path):
clean = line.strip()
if e0 is None and (match := e0_pattern.search(clean)):
e0 = float(match[1])
return e0
def _get_core_potentials_from_outcar_obj(
outcar: Outcar, dir_type: str = "", total_energy: list | float | None = None
):
if outcar.electrostatic_potential is None and not outcar.read_avg_core_poten():
_raise_incomplete_outcar_error(outcar, dir_type=dir_type)
_check_outcar_energy(outcar, total_energy=total_energy)
return -1 * np.array(outcar.electrostatic_potential) or -1 * np.array(outcar.read_avg_core_poten()[-1])
def _check_outcar_energy(outcar: Outcar | PathLike, total_energy: list | float | None = None):
if total_energy:
outcar_energy = (
outcar.final_energy if isinstance(outcar, Outcar) else _get_final_energy_from_outcar(outcar)
)
total_energy = total_energy if isinstance(total_energy, list) else [total_energy]
total_energies = set(np.round(total_energy, 3))
formatted_total_energy = "eV, ".join(f"{energy:.3f}" for energy in total_energies) + " eV"
if len(total_energies) == 2: # most cases, final energy and last electronic step energy
formatted_total_energy += "; final energy & last electronic step energy"
if not any(np.isclose(outcar_energy, energy, atol=0.025) for energy in total_energy):
# 0.025 eV tolerance
warnings.warn(
f"The total energies of the provided (bulk) `OUTCAR` ({outcar_energy:.3f} eV), "
f"used to obtain the atomic core potentials for the eFNV correction, and the "
f"`vasprun.xml` ({formatted_total_energy}), used for energies and structures, do not "
f"match. Please make sure the correct file combination is being used!"
)
def _raise_incomplete_outcar_error(outcar: PathLike | Outcar, dir_type: str = ""):
"""
Raise error about supplied ``OUTCAR`` not having atomic core potential
info.
Input outcar is either a path or a ``pymatgen`` |Outcar| object
"""
outcar_info = f"`OUTCAR` at {outcar}" if isinstance(outcar, PathLike) else "`OUTCAR` object"
dir_type = f"{dir_type} " if dir_type else ""
raise ValueError(
f"Unable to parse atomic core potentials from {dir_type}{outcar_info}. This can happen if "
f"`ICORELEVEL` was not set to 0 (= default) in the `INCAR`, the calculation was finished "
f"prematurely with a `STOPCAR`, or the calculation crashed. The Kumagai (eFNV) charge correction "
f"cannot be computed without this data!"
)
[docs]
def get_procar(procar_path: PathLike) -> Procar:
"""
Read the ``PROCAR(.gz)`` file as a ``pymatgen`` |Procar| object.
Previously, ``pymatgen`` |Procar| parsing did not support SOC calculations,
however this was updated in
https://github.com/materialsproject/pymatgen/pull/3890 to use code from
``easyunfold`` (https://smtg-bham.github.io/easyunfold -- a package for
unfolding electronic band structures for symmetry-broken / defect /
dopant systems, with many plotting & analysis tools).
"""
try:
procar_path = find_archived_fname(str(procar_path)) # convert to string if Path object
except FileNotFoundError:
raise FileNotFoundError(f"PROCAR file not found at {procar_path}(.gz/.xz/.bz/.lzma)!") from None
return Procar(procar_path)
def _get_output_files_and_check_if_multiple(
output_file: PathLike = "vasprun.xml", path: PathLike = "."
) -> tuple[PathLike, bool]:
"""
Search for all files with filenames matching ``output_file``, case-
insensitive.
Args:
output_file (PathLike):
The filename to search for (case-insensitive). Should be either
``vasprun.xml``, ``OUTCAR``, ``LOCPOT`` or ``PROCAR``.
path (PathLike):
The path to the directory to search in.
Returns:
Tuple[PathLike, bool]:
The path to the identified file, and a boolean indicating whether
multiple files were found.
"""
if output_file.lower() == "vasprun.xml":
search_patterns = ["vasprun", ".xml"]
else:
search_patterns = [output_file.lower()]
files = os.listdir(path)
output_files = [
filename
for filename in files
if all(i in filename.lower() for i in search_patterns) and not filename.startswith(".")
]
# sort by direct match to {output_file}, direct match to {output_file}.gz, then alphabetically:
if output_files := sorted(
output_files,
key=lambda x: (x == output_file, x == f"{output_file}.gz", x),
reverse=True,
):
output_path = os.path.join(path, output_files[0])
return (output_path, True) if len(output_files) > 1 else (output_path, False)
return (
os.path.join(path, output_file),
False,
) # so `get_X()` will raise an informative FileNotFoundError
_CALC_OUTPUT_MASK = ("vasprun.xml", "vasprun.xml.gz")
"""
Filename patterns that identify calculation output files.
"""
_SUBFOLDER_PRIORITY = [
"vasp_ncl",
"singlepoint",
"final",
"relax",
"vasp_std",
"vasp_nkred_std",
"vasp_gam",
]
"""
Priority order when auto-detecting calculation subfolders.
"""
def _dataframe_of_files(root: Path) -> pd.DataFrame:
"""
Get a dataframe with one row per file under ``root``, found recursively.
Hidden files/directories (names starting with ``"."``) and files
sitting directly in ``root`` (i.e. with only one path component
relative to ``root``) are excluded.
Columns: ``filename``, ``full_path``, ``folder_path``,
``folder_in_root`` (the first path component relative to ``root``).
Args:
root (Path):
Path to the root directory.
Returns:
pd.DataFrame:
One row per discovered file.
"""
root = Path(root)
rows: list[dict[str, Any]] = []
for f in root.rglob("*"): # recursively find all files under root, ignoring hidden folders/files
if f.is_file():
relative_parts = f.relative_to(root).parts
if any(part.startswith(".") for part in relative_parts) or len(relative_parts) < 2:
continue # ignore hidden files and folders, and files in root directory itself
rows.append(
{
"filename": f.name,
"full_path": f,
"folder_path": f.parent,
"folder_in_root": relative_parts[0],
}
)
return pd.DataFrame(rows)
def _get_calc_files_df(root: Path, calc_output_mask: Iterable[str] = _CALC_OUTPUT_MASK) -> pd.DataFrame:
"""
Get a DataFrame of calculation output files (matching ``calc_output_mask``)
found recursively under ``root``, excluding hidden files/directories and
files sitting directly in ``root``.
This is a filtered view of :func:`_dataframe_of_files`.
Args:
root (Path):
Path to the root directory.
calc_output_mask (Iterable[str]):
Iterable of filename patterns to match. Defaults to
``_CALC_OUTPUT_MASK`` (``("vasprun.xml", "vasprun.xml.gz")``).
Matching is case-insensitive.
Returns:
pd.DataFrame:
One row per matching calculation output file.
"""
files_df = _dataframe_of_files(root)
if files_df.empty:
return pd.DataFrame()
pattern = "|".join(map(re.escape, calc_output_mask))
return files_df[files_df["filename"].str.contains(pattern, regex=True, na=False)]
def _determine_subfolder(
files_df: pd.DataFrame,
candidate_folders: list[str] | None = None,
subfolder_priority: list[str] = _SUBFOLDER_PRIORITY,
) -> str:
"""
Pick the highest-priority calculation subfolder name present in
``files_df`` (restricted to rows whose ``folder_in_root`` is in
``candidate_folders``), or ``"."`` if none of the priority names are found.
Args:
files_df (pd.DataFrame):
DataFrame produced by :func:`_dataframe_of_files`, filtered
to calculation output files.
candidate_folders (list[str]):
Top-level folder names to consider. If ``None`` (default),
considers all top-level folder names.
subfolder_priority (list[str]):
Priority order for subfolder names. Defaults to
``_SUBFOLDER_PRIORITY``
(``["vasp_ncl", "singlepoint", "final", "relax", "vasp_std", "vasp_nkred_std", "vasp_gam"]``)
where folder names are compared case-insensitively.
Returns:
str:
The detected subfolder name, or ``"."``.
"""
candidate_folder_df = (
files_df[files_df["folder_in_root"].isin(candidate_folders)]
if candidate_folders is not None
else files_df
)
for subfolder in subfolder_priority:
if any(subfolder in p.name.lower() for p in candidate_folder_df["folder_path"].unique()):
return subfolder
return "."
def _find_calc_outputs(
output_path: PathLike,
subfolder: PathLike | None = None,
) -> tuple[pd.DataFrame, list[str], str]:
"""
Recursively find calculation output files under ``output_path`` and auto-
detect the calculation subfolder when ``subfolder`` is ``None``.
Shared discovery logic used by both :func:`~doped.analysis.DefectsParser`
and :meth:`~doped.chemical_potentials.CompetingPhasesAnalyzer`.
Args:
output_path (PathLike):
Root directory to search.
subfolder (PathLike | None):
Explicit subfolder name (e.g. ``"vasp_std"``). If
``None``, auto-detected using ``_SUBFOLDER_PRIORITY``
(``["vasp_ncl", "singlepoint", "final", "relax", "vasp_std", "vasp_nkred_std", "vasp_gam"]``)
where folder names are compared case-insensitively.
Returns:
tuple[pd.DataFrame, list[str], str]:
``(calc_files_df, candidate_folders, resolved_subfolder)``
where *resolved_subfolder* is ``"."`` when no priority
subfolder is found.
"""
calc_files_df = _get_calc_files_df(Path(output_path))
if calc_files_df.empty:
return pd.DataFrame(), [], "."
candidate_folders = calc_files_df["folder_in_root"].unique().tolist()
resolved_subfolder = (
_determine_subfolder(calc_files_df, candidate_folders) if subfolder is None else str(subfolder)
)
return calc_files_df, candidate_folders, resolved_subfolder
[docs]
def get_defect_type_and_composition_diff(
defect: Structure | Composition,
bulk: Structure | Composition,
_parameter_order_warn: bool = True,
) -> tuple[str, dict]:
"""
Get the difference in composition between a bulk structure and a defect
structure.
Args:
defect (|Structure| | |Composition|):
The defect structure or composition.
bulk (|Structure| | |Composition|):
The bulk structure or composition.
Returns:
tuple[str, dict[str, int]]:
The defect type (``interstitial``, ``vacancy``, ``substitution`` or
``complex``) and the composition difference between the bulk and
defect structures as a dictionary.
"""
if _parameter_order_warn:
_warn_parameter_order("get_defect_type_and_composition_diff") # TODO: Remove in doped v4.1
bulk_comp = bulk.composition if isinstance(bulk, Structure) else bulk
defect_comp = defect.composition if isinstance(defect, Structure) else defect
bulk_comp_dict = bulk_comp.get_el_amt_dict()
defect_comp_dict = defect_comp.get_el_amt_dict()
composition_diff = {
element: int(defect_amount - bulk_comp_dict.get(element, 0))
for element, defect_amount in defect_comp_dict.items()
if int(defect_amount - bulk_comp_dict.get(element, 0)) != 0
}
if len(composition_diff) == 1 and next(iter(composition_diff.values())) == 1:
defect_type = "interstitial"
elif len(composition_diff) == 1 and next(iter(composition_diff.values())) == -1:
defect_type = "vacancy"
elif len(composition_diff) == 2 and all(i in composition_diff.values() for i in [-1, 1]):
defect_type = "substitution"
else:
defect_type = "complex"
if len(composition_diff) > 5: # likely a mistake, warn user:
warnings.warn(
f"The composition difference between the bulk ({bulk_comp_dict}) and defect "
f"({defect_comp_dict}) structures is quite large, suggesting either a large complex defect "
f"or a mistake in the inputs. Beware!"
)
return defect_type, composition_diff
[docs]
def get_defect_type_and_site_indices(
defect_supercell: Structure,
bulk_supercell: Structure,
site_tol: float | None = None, # TODO: Change to 0.5 and add complex defect handling
abs_tol: bool = False,
use_oxi_states: bool = False,
use_rms: bool = False,
) -> tuple[str, list[int], list[int]]:
"""
Get the defect type, and indices of defect sites in the bulk (vacancies /
substitutions) and defect (interstitials / substitutions) supercells.
Defect sites are determined by matching sites in the bulk and defect
structures (by element and distances), according to ``site_tol``.
Note that this assumes consistent cell definitions (lattice vectors and
bases) for the input defect and bulk supercells, and does not perform any
structural re-orientations.
Args:
defect_supercell (|Structure|):
The defect supercell structure.
bulk_supercell (|Structure|):
The bulk supercell structure.
site_tol (float | None):
The (fractional) tolerance for matching sites between the defect
and bulk structures. If ``abs_tol`` is ``False`` (default), then
the distance threshold for matching is set to the product of
``site_tol`` and the shortest bond length in the bulk structure for
the given species, otherwise the value is used directly (as a
length in Å).
If ``None`` (default), the defect is assumed to be a point defect,
and the largest site mismatch is assigned as the defect site.
abs_tol (bool):
Whether to use ``site_tol`` as an absolute distance tolerance (in
Å) instead of a fractional tolerance (in terms of the shortest bond
length in the structure). Default is ``False``.
use_oxi_states (bool):
Whether to use the oxidation states of the sites in the bulk and
defect structures when considering matching sites (such that e.g.
``Fe3+`` and ``Fe2+`` would be considered different species).
Default is ``False``.
use_rms (bool):
Site mapping (using linear assignment) -- used to determine defect
sites -- will be that which minimises either the summed RMS
distances (if ``use_rms`` is ``True``) or just simple linear sum of
distances (if ``False``, default) between all paired sites.
Returns:
defect_type (str):
The type of defect as a string (``interstitial``, ``vacancy`` or
``substitution``).
missing_bulk_site_indices (list[int]):
Indices of sites in the bulk structure that do not match any site
in the defect structure (according to ``site_tol`` choice).
additional_defect_site_indices (list[int]):
Indices of sites in the defect structure that do not match any site
in the bulk structure (according to ``site_tol`` choice).
"""
# TODO: Default is 0.5 (i.e. half the shortest bond length in the bulk structure for the given
# species). -- add to site_tol docstring
bulk_composition = bulk_supercell.composition
defect_composition = defect_supercell.composition
try:
defect_type, comp_diff = get_defect_type_and_composition_diff(defect_composition, bulk_composition)
except RuntimeError as exc:
raise ValueError(
"Could not identify defect type from number of sites in structure: "
f"{len(bulk_supercell)} in bulk vs. {len(defect_supercell)} in defect?"
) from exc
if site_tol is None and defect_type == "complex":
raise ValueError(
f"Based on the composition difference between defect and bulk structures ({comp_diff}), "
f"the defect is a complex defect, but ``site_tol`` is set to ``None`` which enforces the "
f"assumption of a point defect. Please set ``site_tol`` to allow parsing of complex defect "
f"sites."
)
oxi_state_decorated = [ # if all sites in both structures are not oxi-state decorated / neutral
any(i in site.species_string for i in ["+", "-", "0"])
for site in [*bulk_supercell.sites, *defect_supercell.sites]
]
if len(set(oxi_state_decorated)) > 1 and use_oxi_states: # not consistent, ignore oxi states:
warnings.warn(
"`use_oxi_states` was set to `True`, but not all sites in the bulk and defect structures are "
"oxidation state decorated. Setting `use_oxi_states` to `False`."
)
use_oxi_states = False
elt_symbols = {
str(species) if use_oxi_states else species.symbol
for species in bulk_composition.elements + defect_composition.elements
}
additional_defect_site_indices = []
missing_bulk_site_indices = []
distance_matrix = bulk_supercell.distance_matrix
for elt_symbol in elt_symbols:
bulk_species_coords, bulk_species_indices = get_coords_and_idx_of_species(
bulk_supercell, elt_symbol, use_oxi_states=use_oxi_states
)
defect_species_coords, defect_species_indices = get_coords_and_idx_of_species(
defect_supercell, elt_symbol, use_oxi_states=use_oxi_states
)
if bulk_species_indices.size == 0: # extrinsic species
site_dist_tol = None
else:
species_distances = distance_matrix[bulk_species_indices]
species_min_dist = max(species_distances[np.nonzero(species_distances)].min(), 1)
site_dist_tol = site_tol if site_tol is None or abs_tol else site_tol * species_min_dist
site_mapping = _get_site_mapping_from_coords_and_indices(
bulk_species_coords,
defect_species_coords,
lattice=bulk_supercell.lattice,
s1_indices=bulk_species_indices,
s2_indices=defect_species_indices,
use_rms=use_rms,
)
defect_site_mappings = [
mapping
for mapping in site_mapping
if mapping[0] is None or (site_dist_tol is not None and mapping[0] > site_dist_tol)
]
for mapping in defect_site_mappings:
if mapping[1] is not None: # missing bulk site
missing_bulk_site_indices.append(mapping[1])
if mapping[2] is not None: # additional defect site (may be from same matched tuple if dist
additional_defect_site_indices.append(mapping[2]) # greater than site_dist_tol)
return defect_type, missing_bulk_site_indices, additional_defect_site_indices
[docs]
def get_coords_and_idx_of_species(
structure_or_sites: SiteCollection,
species_name: str,
frac_coords: bool = True,
use_oxi_states: bool = False,
) -> tuple[np.ndarray, np.ndarray]:
"""
Get arrays of the coordinates and indices of the given species in the
structure/list of sites.
"""
from doped.utils.efficiency import _parse_site_species_str
coords = []
idx = []
for i, site in enumerate(structure_or_sites):
if _parse_site_species_str(site, wout_charge=not use_oxi_states) == species_name:
coords.append(site.frac_coords if frac_coords else site.coords)
idx.append(i)
return np.array(coords), np.array(idx)
[docs]
def get_matching_site(
site: PeriodicSite | np.ndarray, structure: Structure, anonymous: bool = False, tol: float = 0.5
) -> PeriodicSite:
"""
Get the (closest) matching |PeriodicSite| in ``structure`` for the input
``site``, which can be a |PeriodicSite| or fractional coordinates.
If the closest matching site in ``structure`` is > ``tol`` Å (0.5 Å by
default) away from the input ``site`` coordinates, an error is raised.
Automatically accounts for possible differences in assigned oxidation
states, site property dicts etc.
Args:
site (|PeriodicSite| | np.ndarray):
The site for which to find the closest matching site in
``structure``, either as a |PeriodicSite| or fractional
coordinates array. If fractional coordinates, then ``anonymous``
is set to ``True``.
structure (|Structure|):
The structure in which to search for matching sites to ``site``.
anonymous (bool):
Whether to use anonymous matching, allowing different
species/elements to match each other (i.e. just matching based on
coordinates). Default is ``False`` if ``site`` is a
|PeriodicSite|, and ``True`` if ``site`` is fractional
coordinates.
tol (float):
A distance tolerance (in Å), where an error will be thrown if the
closest matching site is > ``tol`` Å away from the input ``site``.
Default is 0.5 Å.
Returns:
PeriodicSite:
The closest matching site in ``structure`` to the input ``site``.
"""
if (
isinstance(site, PeriodicSite) and not anonymous
): # try directly match first if site in structure:
if site in structure:
return site
site_w_no_ox_state = deepcopy(site)
remove_site_oxi_state(site_w_no_ox_state)
site_w_no_ox_state.properties = {}
bulk_sites_w_no_ox_state = structure.copy().sites
for bulk_site in bulk_sites_w_no_ox_state:
remove_site_oxi_state(bulk_site)
bulk_site.properties = {}
if site_w_no_ox_state in bulk_sites_w_no_ox_state:
return structure.sites[bulk_sites_w_no_ox_state.index(site_w_no_ox_state)]
site_frac_coords = (
site.frac_coords if hasattr(site, "frac_coords") else np.array(site, dtype=float)
) # ensure site is in fractional coords
# else get closest site in structure, raising error if not within tol Å:
if isinstance(site, PeriodicSite) and not anonymous: # reduce to only matching species
candidate_frac_coords, candidate_indices = get_coords_and_idx_of_species(
structure, site.specie.symbol
)
else:
candidate_frac_coords = structure.frac_coords
candidate_indices = np.arange(len(structure))
closest_site_idx = candidate_indices[
np.argmin(structure.lattice.get_all_distances(site_frac_coords, candidate_frac_coords).ravel())
]
closest_site = structure.sites[closest_site_idx]
closest_site_dist = closest_site.distance_and_image_from_frac_coords(site_frac_coords)[0]
if closest_site_dist > tol:
raise ValueError(
f"Closest site to input defect site ({site}) in bulk supercell is {closest_site} "
f"with distance {closest_site_dist:.2f} Å (greater than {tol} Å and suggesting a likely "
f"mismatch in sites/structures here!)."
)
if (
not anonymous
and isinstance(site, PeriodicSite)
and site.specie.symbol != closest_site.specie.symbol
):
raise ValueError(
f"Closest site to input defect site ({site}) in bulk supercell is {closest_site} "
f"with distance {closest_site_dist:.2f} Å which is a different element! Set `anonymous=True` "
f"to allow matching of different elements/species if this is desired."
)
return closest_site
[docs]
def find_nearest_coords(
candidate_frac_coords: list | np.ndarray,
target_frac_coords: list | np.ndarray,
lattice: Lattice,
return_idx: bool = False,
) -> tuple[list | np.ndarray, int] | list | np.ndarray:
"""
Find the nearest coords in ``candidate_frac_coords`` to
``target_frac_coords``.
If ``return_idx`` is ``True``, also returns the index of the nearest coords
in ``candidate_frac_coords`` to ``target_frac_coords``.
Args:
candidate_frac_coords (list | np.ndarray):
Fractional coordinates (typically from a bulk supercell), to find
the nearest coordinates to ``target_frac_coords``.
target_frac_coords (list | np.ndarray):
The target coordinates to find the nearest coordinates to in
``candidate_frac_coords``.
lattice (|Lattice|):
The lattice object to use with the fractional coordinates.
return_idx (bool):
Whether to also return the index of the nearest coordinates in
``candidate_frac_coords`` to ``target_frac_coords``.
"""
if len(np.array(target_frac_coords).shape) > 1:
raise ValueError("`target_frac_coords` should be a 1D array of fractional coordinates!")
distance_matrix = lattice.get_all_distances(candidate_frac_coords, target_frac_coords).ravel()
match = distance_matrix.argmin()
return candidate_frac_coords[match], match if return_idx else candidate_frac_coords[match]
[docs]
def find_missing_idx(
frac_coords1: list | np.ndarray,
frac_coords2: list | np.ndarray,
lattice: Lattice,
):
"""
Find the missing/outlier index between two sets of fractional coordinates
(differing in size by 1), by grouping the coordinates based on the minimum
distances between coordinates or, if that doesn't give a unique match, the
site combination that gives the minimum summed squared distances between
paired sites.
The index returned is the index of the missing/outlier coordinate in the
larger set of coordinates.
Args:
frac_coords1 (list | np.ndarray):
First set of fractional coordinates.
frac_coords2 (list | np.ndarray):
Second set of fractional coordinates.
lattice (|Lattice|):
The lattice object to use with the fractional coordinates.
"""
subset, superset = ( # supa-set
(frac_coords1, frac_coords2)
if len(frac_coords1) < len(frac_coords2)
else (frac_coords2, frac_coords1)
)
# in theory this could be made even faster using ``lll_frac_tol`` as in ``_cart_dists()`` in
# ``pymatgen``, with smart choice of initial ``lll_frac_tol`` and scanning upwards if the match is
# below the threshold tolerance (as in ``StructureMatcher_scan_stol()``), but in practice this
# function seems to be incredibly fast as is. Can revisit if it ever becomes a bottleneck
_vecs, d_2 = pbc_shortest_vectors(lattice, subset, superset, return_d2=True)
site_matches, _ = get_linear_assignment_solution(d_2) # matching superset indices, of len(subset)
return next(iter(set(np.arange(len(superset), dtype=int)) - set(site_matches)))
def _create_unrelaxed_defect_structure(
defect_supercell: Structure,
bulk_supercell: Structure,
defect_site_idx: int | None = None,
bulk_site_idx: int | None = None,
defect_coords: bool = False,
) -> Structure:
"""
Create the unrelaxed defect structure, which corresponds to the bulk
supercell with the unrelaxed defect site.
The unrelaxed defect site corresponds to the vacancy/substitution site in
the pristine (bulk) supercell for vacancies/substitutions, and the `final`
relaxed interstitial site for interstitials (as the assignment of their
initial site is ambiguous).
Args:
defect_supercell (Structure):
The defect structure.
bulk_supercell (Structure):
The bulk structure.
defect_site_idx (int):
The index of the defect site to use in the unreleaxed defect
structure. Just for consistency with the relaxed defect structure.
bulk_site_idx (int):
The index of the site in the bulk structure that corresponds to the
defect site in the defect structure.
defect_coords (bool):
Whether to use the fractional coordinates of the defect site in the
defect structure, or the bulk structure. Irrelevant for vacancies.
Parent functions in ``doped`` use ``True`` for interstitials, and
``False`` for substitutions (i.e. use bulk site coords).
Returns:
Structure:
The unrelaxed defect structure.
"""
unrelaxed_defect_structure = bulk_supercell.copy() # create unrelaxed defect structure
if bulk_site_idx is not None:
unrelaxed_defect_structure.remove_sites([bulk_site_idx])
if defect_site_idx is not None:
defect_site_in_defect = defect_supercell[defect_site_idx]
if not defect_coords and bulk_site_idx is not None:
defect_coords = bulk_supercell[bulk_site_idx].frac_coords
else:
defect_coords = defect_site_in_defect.frac_coords
unrelaxed_defect_structure.insert(defect_site_idx, defect_site_in_defect.species, defect_coords)
return unrelaxed_defect_structure
[docs]
def get_wigner_seitz_radius(lattice: Structure | Lattice) -> float:
"""
Calculates the Wigner-Seitz radius of the structure, which corresponds to
the maximum radius of a sphere fitting inside the cell.
Templated on the ``calc_max_sphere_radius`` function from ``pydefect``,
but rewritten to avoid calling ``vise`` which causes hanging on Windows.
(https://github.com/SMTG-Bham/doped/issues/147).
Args:
lattice (|Structure| | |Lattice|):
The lattice of the structure (either a ``pymatgen`` |Structure|
or |Lattice| object).
Returns:
float:
The Wigner-Seitz radius of the structure.
"""
lattice_matrix = lattice.matrix if isinstance(lattice, Lattice) else lattice.lattice.matrix
distances = np.zeros(3, dtype=float) # copied over from pydefect v0.9.4; avoid vise issues
for i in range(3):
a_i_a_j = np.cross(lattice_matrix[i - 2], lattice_matrix[i - 1])
a_k = lattice_matrix[i]
distances[i] = abs(np.dot(a_i_a_j, a_k)) / np.linalg.norm(a_i_a_j)
return max(distances) / 2.0
[docs]
def check_atom_mapping_far_from_defect(
defect_supercell: Structure,
bulk_supercell: Structure,
defect_coords: np.ndarray,
coords_are_cartesian: bool = False,
displacement_tol: float = 0.5,
fraction_tol: float = 0.2,
warning: bool | str = "verbose",
) -> bool:
"""
Check the displacement of atoms far from the determined defect site, and
warn the user if they are large (often indicates a mismatch between the
bulk and defect supercell definitions).
For sites of a given species outside the Wigner-Seitz radius of the defect
(the radius of the largest sphere which can fit in the cell), a 'large'
displacement is flagged if either the *mean* displacement exceeds
``displacement_tol`` Ångströms (capturing a systematic/global mismatch),
*or* the *fraction* of such sites individually displaced by more than
``displacement_tol`` exceeds ``fraction_tol`` (capturing a partial mismatch
without being triggered by single outlier sites).
Args:
defect_supercell (|Structure|):
The defect structure.
bulk_supercell (|Structure|):
The bulk structure.
defect_coords (np.ndarray):
The coordinates of the defect site.
coords_are_cartesian (bool):
Whether the defect coordinates are in Cartesian or fractional
coordinates. Default is ``False`` (fractional).
displacement_tol (float):
The tolerance for the displacement of individual atoms far from the
defect site, in Ångströms. Default is 0.5 Å.
fraction_tol (float):
The tolerance for the fraction of far-from-defect sites (of a given
species) displaced by more than ``displacement_tol``, above which a
mismatch is flagged. Default is 0.2 (i.e. 20%).
warning (bool, str):
Whether to throw a warning if a mismatch is detected. If
``warning = "verbose"`` (default), the individual atomic
displacements are included in the warning message.
Returns:
bool:
Returns ``False`` if a mismatch is detected, else ``True``.
"""
wigner_seitz_radius = get_wigner_seitz_radius(bulk_supercell.lattice)
defect_frac_coords = (
defect_coords
if not coords_are_cartesian
else bulk_supercell.lattice.get_fractional_coords(defect_coords)
)
bulk_sites_outside_or_at_ws_radius = [ # vectorised for fast computation
bulk_supercell[i]
for i in np.where(
bulk_supercell.lattice.get_all_distances(
bulk_supercell.frac_coords, defect_frac_coords
).ravel()
> np.max((wigner_seitz_radius - 1, 1))
)[0]
]
defect_sites_outside_wigner_radius = [ # vectorised for fast computation
defect_supercell[i]
for i in np.where(
defect_supercell.lattice.get_all_distances(
defect_supercell.frac_coords, defect_frac_coords
).ravel()
> wigner_seitz_radius
)[0]
]
disps_outside_ws: dict[str, list[float]] = {site.specie.symbol: [] for site in bulk_supercell}
for species in bulk_supercell.composition.elements: # divide and vectorise calc for efficiency
bulk_species_outside_near_ws_coords = get_coords_and_idx_of_species(
bulk_sites_outside_or_at_ws_radius, species.name
)[0]
defect_species_outside_ws_coords = get_coords_and_idx_of_species(
defect_sites_outside_wigner_radius, species.name
)[0]
if (
min(
len(bulk_species_outside_near_ws_coords),
len(defect_species_outside_ws_coords),
)
== 0
):
continue # if no sites of this species outside the WS radius, skip
site_mapping_outside_ws = _get_site_mapping_from_coords_and_indices(
defect_species_outside_ws_coords,
bulk_species_outside_near_ws_coords,
lattice=bulk_supercell.lattice,
)
displacement_dists = [dist for dist, _i, _j in site_mapping_outside_ws if dist is not None]
disps_outside_ws[species.name].extend(np.round(displacement_dists, 2))
if large_disps_outside_ws := {
specie: list
for specie, list in disps_outside_ws.items()
if list
and (
np.mean(list) > displacement_tol # mean displacement of ``specie`` sites exceeds tolerance
or np.mean(np.array(list) > displacement_tol) > fraction_tol # significant fraction exceed
)
}:
message = (
f"Detected atoms far from the defect site (>{wigner_seitz_radius:.2f} Å) with major "
f"displacements (>{displacement_tol} Å) in the defect supercell. This likely indicates a "
f"mismatch between the bulk and defect supercell definitions (-> see troubleshooting docs) or "
f"an unconverged supercell size, both of which could cause errors in parsing. The mean (or at "
f"least {fraction_tol:.0%}) of displacements of the following species, at sites far from the "
f"determined defect position, is >{displacement_tol} Å: {list(large_disps_outside_ws.keys())}"
)
if warning == "verbose":
message += f", with displacements (Å): {large_disps_outside_ws}"
if warning:
warnings.warn(message)
return False
return True
def _get_site_mapping_from_coords_and_indices(
s1_coords: ArrayLike,
s2_coords: ArrayLike,
s1_indices: np.ndarray | None = None,
s2_indices: np.ndarray | None = None,
lattice: Lattice | None = None,
use_rms: bool = False,
) -> list[tuple[float | None, int | None, int | None]]:
"""
Get the site mapping between two sets of coordinates and indices, based on
the shortest distances between sites.
The coordinates are treated as fractional coordinates if ``lattice`` is
provided, otherwise they are treated as Cartesian coordinates.
Args:
s1_coords (np.ndarray[float]):
The fractional coordinates of the first set of sites.
s2_coords (np.ndarray[float]):
The fractional coordinates of the second set of sites.
s1_indices (np.ndarray[int] | None):
The indices of the first set of sites. If ``None``, the indices are
assumed to be the range of the number of sites in ``s1_coords``.
s2_indices (np.ndarray[int] | None):
The indices of the second set of sites. If ``None``, the indices are
assumed to be the range of the number of sites in ``s2_coords``.
lattice (Lattice | None):
The lattice of the structures. If ``None``, the identity matrix is
used (corresponding to the assumption that the input coordinates
are Cartesian).
use_rms (bool):
The returned site mapping (using linear assignment) will be that
which minimises either the summed RMS distances (if ``use_rms`` is
``True``) or just simple linear sum of distances (if ``False``,
default) between all paired sites.
Returns:
list:
A list of lists containing the distance, index from ``s1_indices``
and index from ``s2_indices`` for each matched site.
"""
lattice = lattice or Lattice(np.eye(3))
s1_coords = np.array(s1_coords)
s2_coords = np.array(s2_coords)
if s1_indices is None:
s1_indices = np.arange(len(s1_coords))
if s2_indices is None:
s2_indices = np.arange(len(s2_coords))
for empty_coords, indices, tuple_idx in [(s1_coords, s2_indices, 2), (s2_coords, s1_indices, 1)]:
if empty_coords.size == 0: # handly case of empty input coords
if indices is None:
return [(None, None, None)]
return [
(None, None if tuple_idx == 2 else int(i), None if tuple_idx == 1 else int(i))
for i in indices
]
s1_is_subset = len(s1_coords) < len(s2_coords)
subset_fcoords, subset_indices = (s1_coords, s1_indices) if s1_is_subset else (s2_coords, s2_indices)
superset_fcoords, superset_indices = (
(s2_coords, s2_indices) if s1_is_subset else (s1_coords, s1_indices)
)
_vecs, d_2 = pbc_shortest_vectors(lattice, subset_fcoords, superset_fcoords, return_d2=True)
dists = np.sqrt(d_2)
site_matches, _ = get_linear_assignment_solution(d_2 if use_rms else dists)
site_mapping = [ # site_matches -> matching superset indices, of len(subset)
(dists[i, j], subset_indices[i], superset_indices[j]) for i, j in enumerate(site_matches)
]
for missing_index in set(range(len(superset_fcoords))) - set(site_matches):
site_mapping.append((None, None, superset_indices[missing_index])) # unmatched sites
if not s1_is_subset: # swap tuple order, to match (dist, s1_index, s2_index)
site_mapping = [(dist, index2, index1) for dist, index1, index2 in site_mapping]
return site_mapping
[docs]
def get_site_mappings(
struct1: Structure,
struct2: Structure,
species: SpeciesLike | None = None,
allow_duplicates: bool = False,
threshold: float = 2.0,
anonymous: bool = False,
ignored_species: list[str] | None = None,
frac_coords: bool = True,
use_rms: bool = False,
) -> list[tuple[float | None, int | None, int | None]]:
"""
Get the site mappings between two structures (from ``struct1`` to
``struct2``), based on the shortest distances between sites.
The two structures may have different species orderings.
NOTE: if ``frac_coords = True`` (default), this assumes that both
structures have the same lattice definitions (i.e. that they match, and
aren't rigidly translated/rotated with respect to each other), which is
mostly the case unless we have a mismatching defect/bulk supercell (in
which case the ``check_atom_mapping_far_from_defect`` warning should be
thrown anyway during parsing).
Args:
struct1 (|Structure|):
The input structure.
struct2 (|Structure|):
The template structure.
species (str):
If provided, only sites of this species will be considered when
matching sites. Default is ``None`` (all species).
allow_duplicates (bool):
If ``True``, allow multiple sites in ``struct1`` to be matched to
the same site in ``struct2``. Default is ``False``.
threshold (float):
If the distance between a pair of matched sites is larger than
this, then a warning will be thrown. Default is 2.0 Å.
anonymous (bool):
If ``True``, the species of the sites will not be considered when
matching sites. Default is ``False`` (only matching species can be
matched together).
ignored_species (list[str]):
A list of species to ignore when matching sites. Default is no
species ignored.
frac_coords (bool):
Whether to match sites based on their fractional coordinate
distances (i.e. assuming PBC with matching lattice definitions,
using the lattice of ``struct1``)(default). If ``False``, instead
matches sites based on distances between their Cartesian
coordinates, with no consideration of PBC.
use_rms (bool):
The returned site mapping (using linear assignment -- only
applicable when ``allow_duplicates`` is ``False``) will be that
which minimises either the summed RMS distances (if ``use_rms`` is
``True``) or just simple linear sum of distances (if ``False``,
default) between all paired sites.
Returns:
list:
A list of lists containing the distance, index in ``struct1`` and
index in ``struct2`` for each matched site.
"""
def get_coords(site: PeriodicSite):
return list(site.frac_coords) if frac_coords else list(site.coords)
def get_distances(
coords1: np.ndarray | list, coords2: np.ndarray | list, lattice: Lattice | None = None
):
if frac_coords:
assert lattice is not None, "Lattice needs to be given if frac_coords is True!"
return lattice.get_all_distances(coords1, coords2)
return all_distances(coords1, coords2)
# Generate a site matching table between the input and the template
min_dist_with_index: list[tuple] = []
s1_species_symbols = (
[
species.symbol
for species in struct1.composition.elements
if species.symbol not in (ignored_species or [])
]
if not anonymous
else [None]
)
for s1_species_symbol in s1_species_symbols:
if species is not None and s1_species_symbol != species and not anonymous:
continue
# Build (struct1_index, coords) pairs for this species, preserving ``struct1`` order:
species_input = [
(i, get_coords(site))
for i, site in enumerate(struct1)
if (site.specie.symbol == s1_species_symbol or anonymous)
]
input_coords = [coords for _, coords in species_input]
species_s2_indices = [
i for i, site in enumerate(struct2) if (site.specie.symbol == s1_species_symbol or anonymous)
]
template_coords = [get_coords(struct2[i]) for i in species_s2_indices]
dmat = (
get_distances(input_coords, template_coords, lattice=struct1.lattice)
if template_coords
else None
)
dmat = dmat**2 if (use_rms and dmat is not None) else dmat # square if use_rms is True
# TODO: Can _get_site_mapping_from_coords_and_indices be used instead here (with minimal
# efficiency) loss:?
if not allow_duplicates and dmat is not None:
# Use linear assignment for order-independent optimal matching.
# get_linear_assignment_solution returns (col_ind, total_cost), where col_ind[i] is the
# template index assigned to input row i (requires n_rows <= n_cols). For n > m, transpose
# the problem (assign each template to one input) and invert the mapping:
if len(input_coords) <= len(template_coords):
tmpl_col_indices, _ = get_linear_assignment_solution(dmat)
input_to_template = dict(enumerate(tmpl_col_indices.tolist()))
else:
# dmat.T is (n_templates, n_inputs): each template row j is assigned input column
# input_col_indices[j]. We need input_idx -> tmpl_idx for the loop below:
input_col_indices, _ = get_linear_assignment_solution(dmat.T)
input_to_template = {int(input_col_indices[j]): j for j in range(len(template_coords))}
else:
input_to_template = None
for input_idx, (index, _) in enumerate(species_input):
if dmat is None:
min_dist_with_index.append((None, index, None))
continue
if input_to_template is not None:
if input_idx not in input_to_template:
# No unique template available (more inputs than templates for this species)
min_dist_with_index.append((None, index, None))
continue
tmpl_idx = input_to_template[input_idx]
else: # allow_duplicates=True: each input independently picks its closest template
dists = dmat[input_idx]
tmpl_idx = dists.argmin()
current_dist = float(dmat[input_idx, tmpl_idx]) ** (0.5 if use_rms else 1)
# Map species-local template index (tmpl_idx) to global struct2 index (species_s2_indices):
template_index = species_s2_indices[tmpl_idx]
if current_dist > threshold:
warnings.warn(
f"Large site displacement {current_dist:.2f} Å detected when matching atomic sites: "
f"{struct1[index]} -> {struct2[template_index]}."
)
min_dist_with_index.append((current_dist, index, template_index))
if not min_dist_with_index:
raise RuntimeError(
f"No matching sites for species {species} found between the two structures!\n"
f"Struct1 composition: {struct1.composition}, Struct2 composition: {struct2.composition}"
)
return min_dist_with_index
[docs]
def reorder_s2_like_s1(s1_structure: Structure, s2_structure: Structure, threshold=5.0) -> Structure:
"""
Reorder the atoms of a (relaxed) structure, ``s2_structure``, to match the
ordering of the atoms in ``s1_structure``.
s1/s2 structures may have a different species orderings.
NOTE: This assumes that both structures have the same lattice definitions
(i.e. that they match, and aren't rigidly translated/rotated with respect
to each other), which is mostly the case unless we have a mismatching
defect/bulk supercell (in which case the
``check_atom_mapping_far_from_defect`` warning should be thrown anyway
during parsing).
Args:
s1_structure (|Structure|):
The template structure.
s2_structure (|Structure|):
The structure to reorder, to match ``s1_structure``.
threshold (float):
If the distance between a pair of matched sites is larger than
this value in Å, then a warning will be thrown. Default is 5.0 Å.
Returns:
Structure:
``s2_structure`` reordered to match ``s1_structure``.
"""
# This function was previously used to ensure correct site matching when pulling site potentials for
# the eFNV Kumagai correction, though no longer used for this purpose. If threshold is set to a low
# value, it will raise a warning if there is a large site displacement detected.
if len(s2_structure) != len(s1_structure):
raise ValueError("Structure reordering not possible, structures have different number of sites.")
# Obtain site mapping between the initial_relax_structure and the unrelaxed structure
mapping = get_site_mappings(s1_structure, s2_structure, threshold=threshold)
mapping = sorted(mapping, key=lambda x: cast("int", x[1])) # sort by s1 index (to match s1 ordering)
# Reorder s2_structure so that it matches the ordering of s1_structure
reordered_sites = [s2_structure[mapping_tuple[-1]] for mapping_tuple in mapping]
# Avoid warning about selective_dynamics properties (can happen if user explicitly set "T T T" (or
# otherwise) for the bulk):
with warnings.catch_warnings():
warnings.filterwarnings("ignore", message="Not all sites have property")
return Structure.from_sites(reordered_sites)
def _compare_potcar_symbols(
defect_potcar_symbols,
bulk_potcar_symbols,
defect_name="defect",
bulk_name="bulk",
warn=True,
only_matching_elements=False,
):
"""
Check all POTCAR symbols in the bulk are the same in the defect
calculation.
Returns True if the symbols match, otherwise returns a list of the symbols
for the bulk and defect calculations.
"""
if only_matching_elements:
defect_elements = [symbol["titel"].split()[1].split("_")[0] for symbol in defect_potcar_symbols]
symbols_to_check = [
symbol
for symbol in bulk_potcar_symbols
if symbol["titel"].split()[1].split("_")[0] in defect_elements
]
else:
symbols_to_check = bulk_potcar_symbols
bulk_mismatch_list = []
defect_mismatch_list = []
for symbol in symbols_to_check:
if symbol["titel"] not in [symbol["titel"] for symbol in defect_potcar_symbols]:
if warn:
warnings.warn(
f"The POTCAR symbols for your {defect_name} and {bulk_name} calculations do not "
f"match, which is likely to cause severe errors in the parsed results. Found the "
f"following symbol in the {bulk_name} calculation:"
f"\n{symbol['titel']}\n"
f"but not in the {defect_name} calculation:"
f"\n{[symbol['titel'] for symbol in defect_potcar_symbols]}\n"
f"The same POTCAR settings should be used for all calculations for accurate results!"
)
if not only_matching_elements:
return [defect_potcar_symbols, bulk_potcar_symbols]
bulk_mismatch_list.append(symbol)
defect_mismatch_list.append(
next(
def_symbol
for def_symbol in defect_potcar_symbols
if def_symbol["titel"].split()[1].split("_")[0]
== symbol["titel"].split()[1].split("_")[0]
)
)
if bulk_mismatch_list:
return [defect_mismatch_list, bulk_mismatch_list]
return True
def _compare_kpoints(
defect_actual_kpoints,
bulk_actual_kpoints,
defect_kpoints=None,
bulk_kpoints=None,
defect_name="defect",
bulk_name="bulk",
warn=True,
):
"""
Check bulk and defect KPOINTS are the same, using the
``Vasprun.actual_kpoints`` lists (i.e. the VASP IBZKPTs essentially).
Returns ``True`` if the KPOINTS match, otherwise returns a list of the
KPOINTS for the bulk and defect calculations.
"""
# sort kpoints, in case same KPOINTS just different ordering:
sorted_bulk_kpoints = sorted(np.array(bulk_actual_kpoints), key=tuple)
sorted_defect_kpoints = sorted(np.array(defect_actual_kpoints), key=tuple)
actual_kpoints_eq = len(sorted_bulk_kpoints) == len(sorted_defect_kpoints) and np.allclose(
sorted_bulk_kpoints, sorted_defect_kpoints
)
# if different symmetry settings used (e.g. for bulk), actual_kpoints can differ but are the same
# input kpoints, which we assume is fine:
kpoints_eq = (
(
bulk_kpoints.kpts == defect_kpoints.kpts
and np.allclose(bulk_kpoints.kpts_shift, defect_kpoints.kpts_shift)
)
if bulk_kpoints and defect_kpoints
else False
)
if not (actual_kpoints_eq or kpoints_eq):
if warn:
formatted_defect_kpts = [[float(kpt) for kpt in kpoints] for kpoints in sorted_defect_kpoints]
formatted_bulk_kpts = [[float(kpt) for kpt in kpoints] for kpoints in sorted_bulk_kpoints]
warnings.warn( # list form is more readable
f"The KPOINTS for your {defect_name} and {bulk_name} calculations do not match, which is "
f"likely to cause errors in the parsed results. Found the following KPOINTS in the "
f"{defect_name} calculation:"
f"\n{formatted_defect_kpts}\n"
f"and in the {bulk_name} calculation:"
f"\n{formatted_bulk_kpts}\n"
f"In general, the same KPOINTS settings should be used for all final calculations for "
f"accurate results!"
)
return [
[list(kpoints) for kpoints in sorted_defect_kpoints],
[list(kpoints) for kpoints in sorted_bulk_kpoints],
]
return True
def _compare_incar_tags(
defect_incar_dict: dict[str, str | int | float],
bulk_incar_dict: dict[str, str | int | float],
fatal_incar_mismatch_tags: dict[str, str | int | float] | None = None,
ignore_tags: set[str] | None = None,
defect_name: str = "defect",
bulk_name: str = "bulk",
warn: bool = True,
):
"""
Check bulk and defect INCAR tags (that can affect energies) are the same.
Returns True if no mismatching tags are found, otherwise returns a list of
the mismatching tags.
"""
if fatal_incar_mismatch_tags is None:
fatal_incar_mismatch_tags = { # dict of tags that can affect energies and their defaults in VASP
"AEXX": 0.25, # default 0.25
"ENCUT": 0,
"LREAL": False, # default False
"HFSCREEN": 0, # default 0 (None)
"GGA": "PE", # default PE
"LHFCALC": False, # default False
"ADDGRID": False, # default False
"ISIF": 2,
"LASPH": False, # default False
"PREC": "Normal", # default Normal
"PRECFOCK": "Normal", # default Normal
"LDAU": False, # default False
"NKRED": 1, # default 1
"LSORBIT": False, # default False
}
if ignore_tags is not None:
fatal_incar_mismatch_tags = {
key: val for key, val in fatal_incar_mismatch_tags.items() if key not in ignore_tags
}
def _compare_incar_vals(val1, val2):
if isinstance(val1, str):
return val1.split()[0].lower() == str(val2).split()[0].lower()
if isinstance(val1, int | float) and isinstance(val2, int | float):
return np.isclose(val1, val2, rtol=1e-3)
return val1 == val2
mismatch_list = []
for key, val in bulk_incar_dict.items():
if key in fatal_incar_mismatch_tags:
defect_val = defect_incar_dict.get(key, fatal_incar_mismatch_tags[key])
if not _compare_incar_vals(val, defect_val):
mismatch_list.append((key, defect_val, val))
# get any missing keys:
defect_incar_keys_not_in_bulk = set(defect_incar_dict.keys()) - set(bulk_incar_dict.keys())
for key in defect_incar_keys_not_in_bulk:
if key in fatal_incar_mismatch_tags and not _compare_incar_vals(
defect_incar_dict[key], fatal_incar_mismatch_tags[key]
):
mismatch_list.append((key, defect_incar_dict[key], fatal_incar_mismatch_tags[key]))
if mismatch_list:
if warn:
warnings.warn(
f"There are mismatching INCAR tags for your {defect_name} and {bulk_name} calculations "
f"which are likely to cause errors in the parsed results (energies). Found the following "
f"differences:\n"
f"(in the format: (INCAR tag, value in {defect_name} calculation, value in {bulk_name} "
f"calculation)):"
f"\n{mismatch_list}\n"
f"In general, the same INCAR settings should be used in all final calculations for these "
f"tags which can affect energies!"
)
return mismatch_list
return True
def _format_mismatching_incar_warning(mismatching_INCAR_warnings: list[tuple[str, set]]) -> str:
"""
Convenience function to generate a formatted warning string listing
mismatching INCAR tags and their values in a clean output.
Used in ``doped.analysis`` and ``doped.chemical_potentials`` when checking
calculation compatibilities.
Args:
mismatching_INCAR_warnings (list[tuple[str, set]]):
A list of tuples containing the INCAR tag and the set of
mismatching values for that tag.
Returns:
str:
A formatted string listing the mismatching INCAR tags and their
values.
"""
# group by the mismatching tags, so we can print them together:
mismatching_tags_name_list_dict = {
tuple(sorted(mismatching_set)): sorted(
[
name
for name, other_mismatching_set in mismatching_INCAR_warnings
if other_mismatching_set == mismatching_set
]
) # sort for consistency
for mismatching_set in [mismatching for name, mismatching in mismatching_INCAR_warnings]
}
return "\n".join(
[
f"{entry_list}:\n{list(mismatching)}"
for mismatching, entry_list in mismatching_tags_name_list_dict.items()
]
)
[docs]
def get_magnetization_from_vasprun(vasprun: Vasprun) -> int | float | np.ndarray:
"""
Determine the total magnetization from a |Vasprun| object.
For spin-polarised calculations, this is the difference between the number
of spin-up vs spin-down electrons. For non-spin-polarised calculations,
there is no magnetization. For non-collinear (NCL) magnetization (e.g.
spin-orbit coupling (SOC) calculations), the magnetization becomes a vector
(spinor), in which case we take the vector norm as the total magnetization.
VASP does not write the total magnetization to ``vasprun.xml`` file (but
does to the ``OUTCAR`` file), and so here we have to reverse-engineer it
from the eigenvalues (for normal spin-polarised calculations) or the
projected magnetization & eigenvalues (for NCL calculations). For NCL
calculations, we sum the projected orbital magnetizations for all occupied
states, weighted by the `k`-point weights and normalised by the total
orbital projections for each band and `k`-point. This gives the best
estimate of the total magnetization from the projected magnetization array,
but due to incomplete orbital projections and orbital-dependent non-uniform
scaling factors (i.e. completeness of orbital projects for `s` vs `p` vs
`d` orbitals etc.), there can be inaccuracies up to ~30% in the estimated
total magnetization for tricky cases.
Args:
vasprun (|Vasprun|):
The |Vasprun| object from which to extract the total
magnetization.
Returns:
int or float or np.ndarray:
The total magnetization of the system.
"""
# in theory should be able to use vasprun.idos (integrated dos), but this doesn't show
# spin-polarisation / account for NELECT changes from neutral apparently
eigenvalues_and_occs = vasprun.eigenvalues
kweights = np.array(vasprun.actual_kpoints_weights)
# first check if it's a spin-polarised calculation:
if len(eigenvalues_and_occs) == 1 or not vasprun.is_spin:
# non-spin-polarised or NCL calculation:
if not vasprun.parameters.get("LNONCOLLINEAR", False):
return 0 # non-spin polarised calculation
if getattr(vasprun, "projected_magnetization", None) is None:
raise RuntimeError(
"Cannot determine magnetization from non-collinear Vasprun calculation, as this requires "
"the `Vasprun.projected_magnetization` attribute, which is parsed with "
"`Vasprun(parse_projected_eigen=True)` (default in `doped`)."
)
# else NCL calculation:
# need to scale by the summed orbital projections for each band (which should be 1):
# vasprun.projected_eigenvalues[Spin.up].shape -> (nkpoints, nbands, natoms, norbitals)
summed_orbital_projections = vasprun.projected_eigenvalues[Spin.up].sum(axis=(-2, -1))
summed_orbital_projections = np.where(
summed_orbital_projections == 0, 1, summed_orbital_projections
) # avoid division by zero, by setting any zero values to 1
normalisation_factors = 1 / summed_orbital_projections
# vasprun.projected_magnetization.shape -> (nkpoints, nbands, natoms, norbitals, 3 -- x/y/z)
# sum the projected magnetization over atoms and orbitals, then multiply by per-band/kpoint
# normalisation factors:
normalised_proj_mag_per_kpoint_band_direction = (
vasprun.projected_magnetization.sum(axis=(-3, -2)) * normalisation_factors[..., None]
) # [..., None] adds new axis, which allows broadcasting (i.e.
# (nkpoints, nbands, 3) * (nkpoints, nbands, 1) -- adding the "(...,1 )" dimension)
# then multiply by occupancies, sum over bands, multiply by k-point weights, sum over k-points:
return (
(
normalised_proj_mag_per_kpoint_band_direction
* eigenvalues_and_occs[Spin.up][:, :, 1][..., None]
).sum(axis=1)
* kweights[..., None]
).sum(axis=0)
# product of the sum of occupations over all bands, times the k-point weights:
n_spin_up = np.sum(eigenvalues_and_occs[Spin.up][:, :, 1].sum(axis=1) * kweights)
n_spin_down = np.sum(eigenvalues_and_occs[Spin.down][:, :, 1].sum(axis=1) * kweights)
return n_spin_up - n_spin_down
[docs]
def get_nelect_from_vasprun(vasprun: Vasprun) -> int | float:
"""
Determine the number of electrons (``NELECT``) from a |Vasprun| object.
Args:
vasprun (|Vasprun|):
The |Vasprun| object from which to extract ``NELECT``.
Returns:
int or float: The number of electrons in the system.
"""
# can also obtain this (NELECT), charge and magnetization from Outcar objects, worth keeping in mind
# but not needed atm
# in theory should be able to use vasprun.idos (integrated dos), but this doesn't show
# spin-polarisation / account for NELECT changes from neutral apparently
eigenvalues_and_occs = vasprun.eigenvalues
kweights = vasprun.actual_kpoints_weights
# product of the sum of occupations over all bands, times the k-point weights:
nelect = np.sum(eigenvalues_and_occs[Spin.up][:, :, 1].sum(axis=1) * kweights)
if len(eigenvalues_and_occs) > 1:
nelect += np.sum(eigenvalues_and_occs[Spin.down][:, :, 1].sum(axis=1) * kweights)
elif not vasprun.parameters.get("LNONCOLLINEAR", False):
nelect *= 2 # non-spin-polarised or SOC calc
return round(nelect, 2)
[docs]
def get_neutral_nelect_from_vasprun(vasprun: Vasprun, skip_potcar_init: bool = False) -> int:
"""
Determine the number of electrons (``NELECT``) from a |Vasprun| object,
corresponding to a neutral charge state for the structure.
Args:
vasprun (|Vasprun|):
The |Vasprun| object from which to extract ``NELECT``.
skip_potcar_init (bool):
Whether to skip the initialisation of the ``POTCAR`` statistics
(i.e. the auto-charge determination) and instead try to reverse
engineer ``NELECT`` using the ``DefectDictSet``.
Returns:
int:
The number of electrons in the system for a neutral charge state.
"""
nelect = None
if not skip_potcar_init:
with contextlib.suppress(Exception): # try determine charge without POTCARs first:
grouped_symbols = [list(group) for key, group in itertools.groupby(vasprun.atomic_symbols)]
potcar_summary_stats = _get_potcar_summary_stats()
for trial_functional in ["PBE_64", "PBE_54", "PBE_52", "PBE", potcar_summary_stats.keys()]:
if all(
potcar_summary_stats[trial_functional].get(
vasprun.potcar_spec[i]["titel"].replace(" ", ""), False
)
for i in range(len(grouped_symbols))
):
break
nelect = sum( # this is always the NELECT for the bulk
np.array([len(i) for i in grouped_symbols])
* np.array(
[
potcar_summary_stats[trial_functional][
vasprun.potcar_spec[i]["titel"].replace(" ", "")
][0]["ZVAL"]
for i in range(len(grouped_symbols))
]
)
)
if nelect is not None:
return int(nelect)
# else try reverse engineer NELECT using DefectDictSet
from doped.vasp import DefectDictSet
potcar_symbols = [titel.split()[1] for titel in vasprun.potcar_symbols]
potcar_settings = {symbol.split("_")[0]: symbol for symbol in potcar_symbols}
with warnings.catch_warnings(): # ignore POTCAR warnings if not available
warnings.simplefilter("ignore", UserWarning)
return int(
DefectDictSet(
vasprun.structures[-1],
charge_state=0,
user_potcar_settings=potcar_settings,
).nelect
)
def _get_bulk_supercell(defect_entry: DefectEntry):
if hasattr(defect_entry, "bulk_supercell") and defect_entry.bulk_supercell:
return defect_entry.bulk_supercell
if (
hasattr(defect_entry, "bulk_entry")
and defect_entry.bulk_entry
and hasattr(defect_entry.bulk_entry, "structure")
and defect_entry.bulk_entry.structure
):
return defect_entry.bulk_entry.structure
return None
def _get_defect_supercell(defect_entry: DefectEntry):
if hasattr(defect_entry, "defect_supercell") and defect_entry.defect_supercell:
return defect_entry.defect_supercell
if (
hasattr(defect_entry, "sc_entry")
and defect_entry.sc_entry
and hasattr(defect_entry.sc_entry, "structure")
and defect_entry.sc_entry.structure
):
return defect_entry.sc_entry.structure
return None
def _get_unrelaxed_defect_structure(defect_entry: DefectEntry, **kwargs) -> Structure | None:
if (
hasattr(defect_entry, "calculation_metadata")
and defect_entry.calculation_metadata
and "unrelaxed_defect_structure" in defect_entry.calculation_metadata
):
return defect_entry.calculation_metadata["unrelaxed_defect_structure"]
bulk_supercell = _get_bulk_supercell(defect_entry)
if bulk_supercell is not None: # reparse info:
_update_defect_entry_structure_metadata(defect_entry, **kwargs)
return defect_entry.calculation_metadata.get("unrelaxed_defect_structure")
def _get_defect_supercell_frac_coords(
defect_entry: DefectEntry, relaxed=True
) -> np.ndarray | tuple[float, float, float] | None:
sc_defect_frac_coords: np.ndarray | tuple[float, float, float] | None = (
defect_entry.sc_defect_frac_coords
)
site = None
if not relaxed:
site = _get_defect_supercell_site(defect_entry, relaxed=False)
if sc_defect_frac_coords is None and site is None:
site = _get_defect_supercell_site(defect_entry)
if site is not None:
sc_defect_frac_coords = site.frac_coords
return sc_defect_frac_coords
def _get_defect_supercell_site(defect_entry: DefectEntry, relaxed=True, **kwargs) -> PeriodicSite | None:
def _return_defect_supercell_site(defect_entry: DefectEntry, relaxed=True):
if relaxed or defect_entry.defect.defect_type == DefectType.Interstitial:
# always final relaxed site for interstitials (note that "bulk_site" may be guessed initial
# site if it is close enough to the final relaxed site):
if site := getattr(defect_entry, "defect_supercell_site", None):
return site
if defect_entry.sc_defect_frac_coords is not None:
return PeriodicSite(
defect_entry.defect.site.species,
defect_entry.sc_defect_frac_coords,
_get_defect_supercell(defect_entry).lattice,
)
# otherwise we use ``bulk_site``, for relaxed = False (vacancies & substitutions)
if (
hasattr(defect_entry, "calculation_metadata")
and defect_entry.calculation_metadata
and defect_entry.calculation_metadata.get("bulk_site")
):
return defect_entry.calculation_metadata.get("bulk_site")
return None
if defect_supercell_site := _return_defect_supercell_site(defect_entry, relaxed=relaxed):
return defect_supercell_site
# otherwise need to reparse info:
_update_defect_entry_structure_metadata(defect_entry, **kwargs)
return _return_defect_supercell_site(defect_entry, relaxed=relaxed)
def _update_defect_entry_structure_metadata(defect_entry: DefectEntry, overwrite: bool = False, **kwargs):
"""
Helper function to reparse the defect site information for a given
|DefectEntry|, updating the relevant attributes and calculation metadata.
Args:
defect_entry (|DefectEntry|):
The |DefectEntry| object for which to update the defect site
information.
overwrite (bool):
Whether to overwrite existing |DefectEntry| attributes with the
newly parsed values. Default is ``False`` (i.e. only update if the
attributes are not already set).
**kwargs:
Keyword arguments to pass to ``get_equiv_frac_coords_in_primitive``
(such as ``symprec``, ``dist_tol_factor``,
``fixed_symprec_and_dist_tol_factor``, ``verbose``) and/or
|Defect| initialization (such as ``oxi_state``, ``multiplicity``,
``symprec``, ``dist_tol_factor``) in the
``defect_and_info_from_structures`` function.
"""
from doped.analysis import defect_and_info_from_structures
bulk_supercell = _get_bulk_supercell(defect_entry)
defect_supercell = _get_defect_supercell(defect_entry)
(
defect,
defect_site,
defect_structure_metadata,
) = defect_and_info_from_structures(
defect_supercell,
bulk_supercell,
_parameter_order_warn=False,
**kwargs, # pass any additional kwargs (e.g. oxidation state, multiplicity, etc.)
)
if not getattr(defect_entry, "calculation_metadata", None):
defect_entry.calculation_metadata = {}
# update any missing calculation_metadata:
for k, v in defect_structure_metadata.items():
if not defect_entry.calculation_metadata.get(k) or overwrite:
defect_entry.calculation_metadata[k] = v
for attr_name, value in {
"defect": defect,
"sc_defect_frac_coords": defect_site.frac_coords, # _relaxed_ defect site
"defect_supercell_site": defect_site,
"defect_supercell": defect_supercell,
"bulk_supercell": bulk_supercell,
}.items():
if getattr(defect_entry, attr_name, None) is None or overwrite:
setattr(defect_entry, attr_name, value)
def _num_electrons_from_charge_state(structure: Structure, charge_state: int = 0) -> int:
"""
Get the total number of electrons (including core electrons! -- so
different to ``NELECT`` in VASP in most cases) for a given structure and
charge state.
Args:
structure (|Structure|):
The structure for which to get the total number of electrons.
charge_state (int):
The charge state of the system. Default is 0.
Returns:
int:
The total number of electrons in the system, including core
electrons.
"""
total_Z = int(
sum(Element(elt).Z * num for elt, num in structure.composition.get_el_amt_dict().items())
)
return int(total_Z + charge_state)
[docs]
def spin_degeneracy_from_vasprun(vasprun: Vasprun, charge_state: int | None = None) -> int:
"""
Get the spin degeneracy (multiplicity) of a system from a ``VASP`` vasprun
output.
Spin degeneracy is determined by first getting the total magnetization and
thus electron spin (S = N_μB/2 -- where N_μB is the magnetization in Bohr
magnetons (i.e. electronic units, as used in VASP), and using the spin
multiplicity equation: ``g_spin = 2S + 1``. The total magnetization
``N_μB`` is determined using ``get_magnetization_from_vasprun`` (see
docstring for details), and if this fails, then simple spin behaviour is
assumed with singlet (S = 0) behaviour for even-electron systems and
doublet behaviour (S = 1/2) for odd-electron systems.
For non-collinear (NCL) magnetization (e.g. spin-orbit coupling (SOC)
calculations), the magnetization ``N_μB`` becomes a vector (spinor), in
which case we take the vector norm as the total magnetization. This can be
non-integer in these cases (e.g. due to SOC mixing of spin states, as
**_S_** is no longer a good quantum number). As an approximation for these
cases, we round ``N_μB`` to the nearest integer which would be allowed
under collinear magnetism (i.e. even numbers for even-electron systems, odd
numbers for odd-electron systems).
Args:
vasprun (|Vasprun|):
``pymatgen`` |Vasprun| for which to determine spin degeneracy.
charge_state (int):
The charge state of the system, which can be used to determine the
number of electrons. If ``None`` (default), automatically
determines the number of electrons using
``get_nelect_from_vasprun(vasprun)``.
Returns:
int: Spin degeneracy of the system.
"""
if charge_state is None:
num_electrons = get_nelect_from_vasprun(vasprun)
else:
num_electrons = _num_electrons_from_charge_state(vasprun.final_structure, charge_state)
try:
raw_magnetization = get_magnetization_from_vasprun(vasprun)
# take the vector norm as the total magnetization (for NCL (SOC) / vector magnetization):
magnetization = float(np.linalg.norm(raw_magnetization))
# round to nearest possible value (even numbers for even-electron systems, odd for odd-electron):
if num_electrons % 2 == 0: # even-electron system, spin degeneracy = 1, 3, 5, ...
magnetization = round(magnetization / 2) * 2 # nearest even number
else:
magnetization = round((magnetization - 1) / 2) * 2 + 1 # nearest odd number
# spin multiplicity = 2S + 1 = 2(mag/2) + 1 = mag + 1 (where mag is in Bohr magnetons
# i.e. number of electrons, as in VASP):
return abs(magnetization) + 1
except (RuntimeError, TypeError): # NCL calculation without parsed projected magnetization:
return _simple_spin_degeneracy_from_num_electrons(int(num_electrons)) # guess from charge
def _simple_spin_degeneracy_from_num_electrons(num_electrons: int = 0) -> int:
"""
Get the spin degeneracy of a system from the total number of electrons,
assuming simple singlet (S=0) behaviour for even-electron systems or
doublet (S=1/2) behaviour for odd-electron systems.
Spin multiplicity is equal to ``2S + 1``, so 1 for singlets (S = 0), 2 for
doublets (S = 1/2), 3 for triplets (S = 1) etc.
Args:
num_electrons (int): The total number of electrons.
Returns:
int:
The spin multiplicity assuming singlet or doublet behaviour.
"""
return int(num_electrons % 2 + 1)
[docs]
def total_charge_from_vasprun(vasprun: Vasprun) -> int | None:
"""
Determine the total charge state of a system from the vasprun, and compare
to the expected charge state if provided.
Note that if the system is charged, then this function relies on access to
``POTCAR`` data, which can be setup with ``pymatgen`` as detailed on the
:ref:`installation page <setup_potcars_mp_api>`.
Args:
vasprun (|Vasprun|):
``pymatgen`` |Vasprun| object for which to determine the total
charge.
Returns:
int or None:
The total charge state, or ``None`` if it cannot be determined.
"""
if (nelect := vasprun.incar.get("NELECT")) is None:
return 0 # neutral if NELECT not specified
auto_charge = None
with contextlib.suppress(Exception): # otherwise determine neutral NELECT from vasprun & POTCARs:
neutral_nelect = get_neutral_nelect_from_vasprun(vasprun)
auto_charge = -1 * (nelect - neutral_nelect)
if abs(auto_charge) >= 10:
neutral_nelect = get_neutral_nelect_from_vasprun(vasprun, skip_potcar_init=True)
auto_charge = -1 * (nelect - neutral_nelect)
return auto_charge
def _get_bulk_locpot_dict(bulk_path, quiet=False):
bulk_locpot_path, multiple = _get_output_files_and_check_if_multiple("LOCPOT", bulk_path)
if multiple and not quiet:
_multiple_files_warning(
"LOCPOT",
bulk_path,
bulk_locpot_path,
dir_type="bulk",
)
bulk_locpot = get_locpot(bulk_locpot_path)
return {str(k): bulk_locpot.get_average_along_axis(k) for k in [0, 1, 2]}
def _get_bulk_site_potentials(
bulk_path: PathLike, quiet: bool = False, total_energy: list | float | None = None
):
bulk_outcar_path, multiple = _get_output_files_and_check_if_multiple("OUTCAR", bulk_path)
if multiple and not quiet:
_multiple_files_warning(
"OUTCAR",
bulk_path,
bulk_outcar_path,
dir_type="bulk",
)
return get_core_potentials_from_outcar(bulk_outcar_path, dir_type="bulk", total_energy=total_energy)
def _update_defect_entry_charge_corrections(defect_entry, charge_correction_type):
meta = defect_entry.calculation_metadata[f"{charge_correction_type}_meta"]
corr = (
meta[f"{charge_correction_type}_electrostatic"]
+ meta[f"{charge_correction_type}_potential_alignment_correction"]
)
defect_entry.corrections.update({f"{charge_correction_type}_charge_correction": corr})
_vasp_file_parsing_action_dict = {
"vasprun.xml": "parse the calculation energy and metadata.",
"OUTCAR": "parse core levels and compute the Kumagai (eFNV) image charge correction.",
"LOCPOT": "parse the electrostatic potential and compute the Freysoldt (FNV) charge correction.",
}
def _multiple_files_warning(file_type, directory, chosen_filepath, action=None, dir_type="bulk"):
filename = os.path.basename(chosen_filepath)
if action is None:
action = _vasp_file_parsing_action_dict[file_type]
warnings.warn(
f"Multiple `{file_type}` files found in {dir_type} directory: {directory}. Using {filename} to "
f"{action}"
)
[docs]
def get_dimer_bonds(structure: Structure, rtol: float = 1.05) -> dict[str, list[float]]:
"""
Get a dictionary of all homoionic (dimer) bonds in the structure.
This function uses the ``get_homoionic_bonds`` and
``get_dimer_bond_length`` functions from ``shakenbreak`` to identify dimer
bonds in the structure (where any pair of atoms of the same element with
distance < ``rtol * get_dimer_bond_length(elt, elt)`` are considered a
dimer bond), returning a dictionary of the site names and the dimer bond
length.
Args:
structure (|Structure|): The structure to get the dimer bond lengths for.
rtol (float):
The relative tolerance to use for classifying bonds as dimer bonds,
where distances < ``rtol * get_dimer_bond_length(elt, elt)`` are
considered dimer bonds. Default is 1.05.
Returns:
dict[str, list[float]]:
A dictionary of element names with values being sub-dictionaries of
site names and their homoionic neighbours and distances (in Å)
which are classified as dimer bonds.
(e.g. {'O': {'O(1)': {'O(3)': '1.44 Å'}}})
"""
from shakenbreak.analysis import get_homoionic_bonds
from shakenbreak.distortions import get_dimer_bond_length
dimer_bond_dict = {
str(elt): get_homoionic_bonds(
structure=structure,
elements=str(elt),
radius=rtol * get_dimer_bond_length(elt, elt),
verbose=False,
)
for elt in structure.composition.elements
}
return {k: v for k, v in dimer_bond_dict.items() if v}