Source code for doped.utils.stenciling

"""
Utility functions to re-generate a relaxed defect structure in a different
supercell.
"""

import math
import warnings
from collections import Counter
from collections.abc import Sequence
from itertools import combinations, product

import numpy as np
from tqdm import tqdm

from doped.analysis import defect_site_from_structures
from doped.core import DefectEntry
from doped.thermodynamics import _ensure_list
from doped.utils.configurations import apply_s2_to_s1_transformation, get_transformation_from_s2_to_s1
from doped.utils.efficiency import (
    Composition,
    Hashabledict,
    Lattice,
    PeriodicSite,
    Structure,
    _cached_Composition_init,
    _Composition__eq__,
)
from doped.utils.parsing import (
    _get_bulk_supercell,
    _get_defect_supercell,
    check_atom_mapping_far_from_defect,
    get_coords_and_idx_of_species,
    get_defect_type_and_composition_diff,
    get_site_mapping_indices,
    get_wigner_seitz_radius,
)
from doped.utils.supercells import _largest_cube_length_from_matrix, min_dist
from doped.utils.symmetry import (
    SymmOp,
    apply_symm_op_to_site,
    apply_symm_op_to_struct,
    get_clean_structure,
    get_distance_matrix,
    get_sga,
    translate_structure,
)


[docs] def get_defect_in_supercell( defect_entry: ( DefectEntry | tuple[Structure, Structure] | tuple[Structure, Structure, list | np.ndarray] ), target_supercell: Structure, check_bulk: bool = True, target_frac_coords: np.ndarray[float] | list[float] | bool = True, edge_tol_range: float | range | list | np.ndarray | None = None, min_dist_tol_factor_range: float | range | list | np.ndarray | None = None, min_dist_warning_tol_factor: float = 0.9, orientation_template_radii_range: float | range | list | np.ndarray | None = None, show_pbar: bool = True, ) -> tuple[Structure, Structure]: """ Re-generate a (relaxed) defect structure in a (arbitrarily) different supercell. This function takes the relaxed defect structure of the input |DefectEntry| (from ``DefectEntry.defect_supercell``), or input |Structure| objects (if ``defect_entry`` is a tuple, see argument descriptions) and re-generates it in the ``target_supercell`` structure (which may be smaller or larger than the original supercell), using the bulk supercell to intelligently pad out the additional missing positions in the new supercell as needed. The defect is placed at the closest possible position to ``target_frac_coords`` (default is the supercell centre = [0.5, 0.5, 0.5]). In most cases, the generated supercell should correspond to the same supercell basis / tiling as the input ``target_supercell``. In some cases, however, this is not possible (in which case a warning is thrown), and so this function also returns the corresponding bulk supercell for the generated supercell (which should be the same for each generated defect supercell given the same ``target_supercell`` and base supercell for ``defect_entry``, see notes below). ``target_supercell`` should be the same host crystal structure, just with different supercell dimensions. The stenciling algorithm is typically robust to small differences in lattice parameters (i.e. volume per atom) and bond lengths, but large differences in these between the original and target bulk supercells can cause issues. Briefly, this function works by: - Translating the defect site to the centre of the original supercell (and applying to the bulk supercell as well). - Identifying a super-supercell of the original supercell which fully encompasses the target supercell (regardless of orientation). - Generate this super-supercell, using one copy of the original defect supercell (``DefectEntry.defect_supercell``), with the rest of the sites (outside of the original defect supercell box, with the defect translated to the centre) populated using the bulk supercell (``DefectEntry.bulk_supercell``). Same for translated bulk supercell. - Orient these super-supercells to (try) match the orientation of ``target_supercell``, to (try) avoid outputting stenciled supercells with different primitive cell tiling (i.e. different structural bases) to the target supercell. - Translate the defect site in these super-supercells to the Cartesian coordinates of the centre of ``target_supercell``, then stencil out all sites in the ``target_supercell`` portion of the super-supercell, accounting for possible site displacements in the relaxed defect supercell (e.g. if ``target_supercell`` has a different shape and does not fully encompass the original defect supercell). This is done by scanning over possible combinations of sites near the boundary regions of the ``target_supercell`` portion (using ``edge_tol`` and ``min_dist_tol_factor`` etc), and identifying the combination which maximises the minimum inter-atomic distance in the new supercell (i.e. the most bulk-like arrangement). Same for bulk super-supercell. - Re-orient this new stenciled supercell (and the corresponding bulk) to (attempt to) match the orientation and site positions of ``target_supercell``. - If ``target_frac_coords`` is not ``False``, scan over all symmetry operations of ``target_supercell`` and apply that which places the defect site closest to ``target_frac_coords``, to both the defect and bulk supercells. Note: While this function tries to ensure that the generated defect/bulk supercell position basis exactly matches that of ``target_supercell``, this is `not` guaranteed. A mismatch can arise due to different tiling of primitive cells within ``target_supercell`` and the original defect supercell, which are symmetry-equivalent for the bulk (pristine) supercell but not for the defect supercell (due to the broken periodicity/symmetry). The generated supercell `is` guaranteed to have the exact same lattice parameters etc. This is perfectly fine if it occurs (in which case a warning will be thrown, unless ``check_bulk`` is ``False``), just will require the use of a matching bulk/reference supercell when parsing (rather than the input ``target_supercell``) to avoid any issues with finite-size corrections and defect site-matching -- ``doped`` will also throw a warning about this when parsing if a non-matching bulk supercell is used anyway. This function returns the corresponding bulk supercell which should be used for parsing defect calculations with the generated supercell. Of course, if generating multiple defects in the same ``target_supercell`` with this issue occurring, only one such bulk supercell calculation should be required (`should` correspond to the same bulk supercell in each case). We note that the algorithm employed here is not guaranteed to be deterministic. It can depend on site orderings, small differences in cell definitions etc. For these small differences, however, the returned stenciled supercell should still be effectively the same, just with small differences in atomic positions near the cell boundaries due to defect-induced strain (which for reasonably large original & stenciling supercells should be negligible). Args: defect_entry (|DefectEntry| | tuple[|Structure|, |Structure|] | tuple[|Structure|, |Structure|, np.ndarray]): A |DefectEntry| object for which to re-generate the relaxed structure (taken from ``DefectEntry.defect_supercell``) in the ``target_supercell`` lattice. Alternatively, a tuple of ``(defect_supercell, bulk_supercell)`` structures can be provided, in which case the defect fractional coordinates are determined automatically (by comparing the two supercells). The defect fractional coordinates can optionally be provided as a third element with the tuple input option (as a ``numpy`` array or list), to skip auto-determination in this case. target_supercell (|Structure|): The supercell structure to re-generate the relaxed defect structure in. check_bulk (bool): Whether to check if the generated defect/bulk supercells have different atomic position bases to ``target_supercell`` (as described above) -- if so, a warning will be printed (unless ``check_bulk`` is ``False``). Default is ``True``. target_frac_coords (np.ndarray[float] | list[float] | bool): The fractional coordinates to target for defect placement in the new supercell. If just set to ``True`` (default), will try to place the defect nearest to the centre of the superset cell (i.e. ``target_frac_coords = [0.5, 0.5, 0.5]``), as is default in ``doped`` defect generation. Note that defect placement is harder in this case than in generation with |DefectsGenerator|, as we are not starting from primitive cells, and we are working with relaxed geometries. If ``False``, does not attempt any defect re-positioning. edge_tol_range (float | range | list | np.ndarray | None): A range or list of tolerances (in Angstrom) for site displacements at the edge of the stenciled supercell to scan over, when determining the best match of sites to stencil out in the new supercell (of ``target_supercell`` dimension). Default is ``None``, in which case the default range is used (0.2 to 4 Å in 0.2 Å increments). Once a stenciling solution which satisfies the ``min_dist_tol_factor`` tolerance is found for a given ``edge_tol``, the search is terminated. See ``stencil_target_cell_from_big_cell`` for further details. min_dist_tol_factor_range (float | range | list | np.ndarray | None): A range or list of tolerance factors for the minimum interatomic distance (relative to the ``bulk_min_bond_length``) at the edge region of the stenciled defect supercell to scan over, when determining the best match of sites to stencil out in the new supercell. Default is ``None``, in which case the default range is used (0.95 to 0.5 in 0.05 increments). See ``stencil_target_cell_from_big_cell`` for further details. min_dist_warning_tol_factor (float): A tolerance factor for the minimum interatomic distance (relative to the ``bulk_min_bond_length``) in the stenciled defect supercell to use as a warning threshold. If the minimum interatomic distance near the edge of the stenciled defect supercell is less than this threshold, a warning is issued. Default is 0.9 (i.e. 90% of the ``bulk_min_bond_length``). orientation_template_radii_range (float | range | list | np.ndarray | None): A range or list of scale factors (relative to the Wigner-Seitz radius of ``target_supercell``) to scan over when constructing the template sub-structures of ``target_supercell`` and the bulk super-supercell used for pre-stenciling orientation matching (to try to ensure matching supercell atomic bases (i.e. tiling of primitive cells in the supercells) in the output stenciled cells with ``target_supercell``). Default is ``None``, in which case the default test range of ``[0.8, 1.0, 0.6, 0.4, 1.2]`` is used. It is expected that this parameter should rarely be required to tune. show_pbar (bool): Whether to show a ``tqdm`` progress bar. Default is ``True``. Returns: tuple[Structure, Structure]: The re-generated defect supercell in the ``target_supercell`` lattice, and the corresponding bulk/reference supercell for the generated defect supercell (see explanations above). """ # Note to self; using Pycharm breakpoints throughout is likely easiest for debugging if show_pbar: pbar = tqdm( total=100, bar_format="{desc}{percentage:.1f}%|{bar}| [{elapsed}, {rate_fmt}{postfix}]" ) # tqdm progress bar. 100% is completion pbar.set_description("Getting super-supercell (relaxed defect + bulk sites)") else: pbar = None bulk_mismatch_warning = False try: if isinstance(defect_entry, tuple): orig_supercell = defect_entry[0].copy() orig_bulk_supercell = defect_entry[1].copy() if len(defect_entry) > 2: orig_defect_frac_coords = np.array(defect_entry[2]) else: defect_site = defect_site_from_structures( orig_supercell, orig_bulk_supercell, _parameter_order_warn=False ) assert isinstance(defect_site, PeriodicSite) orig_defect_frac_coords = defect_site.frac_coords else: orig_supercell = _get_defect_supercell(defect_entry) orig_bulk_supercell = _get_bulk_supercell(defect_entry) orig_defect_frac_coords = defect_entry.sc_defect_frac_coords target_supercell = target_supercell.copy() bulk_min_bond_length = min_dist(orig_bulk_supercell) target_frac_coords = [0.5, 0.5, 0.5] if target_frac_coords is True else target_frac_coords # ensure no oxidation states (for easy composition matching later) for struct in [orig_supercell, orig_bulk_supercell, target_supercell]: struct.remove_oxidation_states() # first translate both orig supercells to put defect in the middle, to aid initial stenciling: orig_def_to_centre = np.array([0.5, 0.5, 0.5]) - orig_defect_frac_coords trans_orig_supercell = translate_structure(orig_supercell, orig_def_to_centre, frac_coords=True) trans_orig_bulk_supercell = translate_structure( orig_bulk_supercell, orig_def_to_centre, frac_coords=True ) # get big_supercell, which is an expanded version of trans_orig_supercell so that _each_ lattice # vector is now bigger than the _largest_ lattice vector in target_supercell (so that # target_supercell is fully encompassed by big_supercell): superset_matrix = _get_superset_matrix_and_supercells(trans_orig_supercell, target_supercell) # add "X" marker (fake species) to defect site in trans_orig_supercell" trans_orig_supercell.append("X", [0.5, 0.5, 0.5], coords_are_cartesian=False) big_defect_supercell_with_X = trans_orig_supercell * superset_matrix big_bulk_supercell = trans_orig_bulk_supercell * superset_matrix # get big bulk supercell # this big_defect_supercell_with_X is with the defect now repeated in it, but we want it with just # one defect, so only take the first repeated defect supercell, and then the rest of the sites from # the expanded bulk supercell: # only keep atoms in big_defect_supercell_with_X which are within the original supercell bounds: big_defect_supercell = _get_matching_sites_from_s1_then_s2( template_struct=trans_orig_supercell, struct1_pool=big_defect_supercell_with_X, struct2_pool=big_bulk_supercell, ) # now we try to pre-emptively reorient the big supercell to match the orientation of # target_supercell (to try avoid output stenciled supercells with different tiling, requiring # additional bulk supercell calculations). # First, we reduce to only atoms closest to the centre for both target and super-supercells, to # greatly reduce comp costs with site-matching enormous supercells; for this, we scan over possible # radii for the sampling sphere as some choices give matches (transformations) and some fail: orientation_template_radii_range = _ensure_list( orientation_template_radii_range if orientation_template_radii_range is not None else [0.8, 1.0, 0.6, 0.4, 1.2] # scale factors of get_wigner_seitz_radius(target_supercell) ) assert isinstance(orientation_template_radii_range, list | np.ndarray) # typing wsr_target = get_wigner_seitz_radius(target_supercell) trans_to_match_target = None fake_target_supercell_w_big_supercell_lattice = None for r_factor in orientation_template_radii_range: fake_target_supercell_sites = target_supercell.get_sites_in_sphere( target_supercell.lattice.get_cartesian_coords([0.5, 0.5, 0.5]), r=r_factor * wsr_target, ) if not fake_target_supercell_sites: continue fake_target_supercell_w_big_supercell_lattice = Structure( big_bulk_supercell.lattice, [site.species for site in fake_target_supercell_sites], [site.coords for site in fake_target_supercell_sites], coords_are_cartesian=True, ) fake_big_supercell = Structure.from_sites( big_bulk_supercell.get_sites_in_sphere( big_bulk_supercell.lattice.get_cartesian_coords([0.5, 0.5, 0.5]), r=r_factor * wsr_target / 0.8, # slightly larger r for fake big supercell ) ) trans_to_match_target = get_transformation_from_s2_to_s1( fake_target_supercell_w_big_supercell_lattice, fake_big_supercell, allow_subset=True, # allow subset to match, likely different number of atoms here max_stol=0.5, ) if trans_to_match_target is not None: break if trans_to_match_target is not None: oriented_big_bulk_supercell = _orient_to_match_target( big_bulk_supercell, fake_target_supercell_w_big_supercell_lattice, trans_to_match_target ) oriented_big_defect_supercell = _orient_to_match_target( big_defect_supercell, fake_target_supercell_w_big_supercell_lattice, trans_to_match_target ) else: oriented_big_bulk_supercell = big_bulk_supercell oriented_big_defect_supercell = big_defect_supercell if pbar is not None: pbar.update(20) # 20% of progress bar pbar.set_description("Getting sites in border region") # translate structure to put defect at the centre of the big supercell (w/frac_coords) big_supercell_defect_site = next( s for s in oriented_big_defect_supercell.sites if s.specie.symbol == "X" ) def_to_centre = np.array([0.5, 0.5, 0.5]) - big_supercell_defect_site.frac_coords oriented_big_defect_supercell = translate_structure( oriented_big_defect_supercell, def_to_centre, frac_coords=True ) oriented_big_bulk_supercell = translate_structure( oriented_big_bulk_supercell, def_to_centre, frac_coords=True ) # get all atoms in big supercell within the cartesian bounds of the target_supercell supercell, # accounting for positional noise: new_bulk_supercell = stencil_target_cell_from_big_cell( oriented_big_bulk_supercell, target_supercell, target_composition=target_supercell.composition, bulk_min_bond_length=bulk_min_bond_length, edge_tol_range=1e-3, min_dist_tol_factor_range=0.99, pbar=None, ) # shouldn't need `edge_tol`, should be much faster than defect supercell stencil new_defect_supercell = stencil_target_cell_from_big_cell( oriented_big_defect_supercell, target_supercell, target_composition=( target_supercell.composition + orig_supercell.composition - orig_bulk_supercell.composition ), bulk_min_bond_length=bulk_min_bond_length, edge_tol_range=edge_tol_range, min_dist_tol_factor_range=min_dist_tol_factor_range, min_dist_warning_tol_factor=min_dist_warning_tol_factor, pbar=pbar, ) if pbar is not None: pbar.update(15) # 55% of progress bar pbar.set_description("Ensuring matching orientation w/target_supercell") # now we try to re-orient the new defect (and bulk) supercells to match the orientation of the # input ``target_supercell``. First we orient the generated _bulk_ supercell to match the # ``target_supercell``, to try to ensure consistency in the generated supercells. # Note: this function is typically the main bottleneck in this workflow. We have already # optimised the underlying |StructureMatcher| workflow in many ways (caching, # fast structure/site/composition comparisons, skipping comparison of defect neighbourhood to # reduce requisite ``stol`` etc; being many orders of magnitude faster than the base # ``pymatgen`` |StructureMatcher|), however the ``_cart_dists()`` function call (used by # ``orient_s2_like_s1``, ``get_transformation_from_s2_to_s1``) is still quite expensive, # especially with large structures with significant noise in the atomic positions... trans = get_transformation_from_s2_to_s1(target_supercell, new_bulk_supercell, max_stol=5) # Note: Here we use a relatively large max stol, because we want to get `some` transformation # regardless of whether a perfect match is possible, as we want a consistent output bulk supercell # (in the case of the stenciled bulk supercell not matching the target supercell), so that only one # additional bulk supercell calculation should be required in this worst case scenario. This # approach gives us a consistent/deterministic output. In theory, we could be faster by breaking at # a lower stol and using an alternative deterministic setting for the new bulk supercell, but not # straightforward and use of fast algorithms and LRU caching minimises the cost here if trans: oriented_new_bulk_supercell = _orient_to_match_target( new_bulk_supercell, target_supercell, trans ) oriented_new_defect_supercell = _orient_to_match_target( new_defect_supercell, target_supercell, trans ) else: warnings.warn( # shouldn't happen for most cases... "No mapping from the (bulk) stenciled cell to ``target_supercell`` could be found. This " "may indicate a severe issue with stenciling for this defect. Please report this issue to " "the developers if there are no obvious causes of this." ) oriented_new_bulk_supercell = new_bulk_supercell oriented_new_defect_supercell = new_defect_supercell oriented_new_defect_site = next( # get defect site and remove X from sites oriented_new_defect_supercell.pop(i) for i, site in enumerate(oriented_new_defect_supercell.sites) if site.specie.symbol == "X" ) if pbar is not None: pbar.update(35) # 90% of progress bar if target_frac_coords is not False: # Scan symm_ops to try and place defect closest to target_frac_coords as possible if pbar is not None: pbar.set_description("Placing defect closest to target_frac_coords") target_symm_op = _scan_symm_ops_to_place_site_closest_to_frac_coords( target_supercell, oriented_new_defect_site, target_frac_coords ) def _apply_symm_op_and_clean_structure(structure: Structure, symm_op: SymmOp) -> Structure: return get_clean_structure( apply_symm_op_to_struct(symm_op, structure, fractional=True, rotate_lattice=False), ) oriented_new_defect_supercell = _apply_symm_op_and_clean_structure( oriented_new_defect_supercell, target_symm_op ) oriented_new_bulk_supercell = _apply_symm_op_and_clean_structure( oriented_new_bulk_supercell, target_symm_op ) if check_bulk: # check orientation bulk_mismatch_warning = not check_atom_mapping_far_from_defect( oriented_new_bulk_supercell, # could use defect or bulk supercell here, bulk cleaner target_supercell, [0.5, 0.5, 0.5], # 'defect' site choice irrelevant here warning=False, ) if pbar is not None: pbar.update(pbar.total - pbar.n) # set to 100% of progress bar finally: if pbar is not None: pbar.close() if bulk_mismatch_warning: # print warning after closing pbar; cleaner warnings.warn( "Note that the atomic position basis of the generated defect/bulk supercell differs from that " "of the ``target_supercell``. This is typically due to different tiling of primitive cells " "within ``target_supercell`` and the original defect supercell, which are symmetry-equivalent " "for the bulk (pristine) supercell but not for the defect supercell (due to the broken " "periodicity/symmetry). For accurate finite-size charge corrections when parsing, a matching " "bulk and defect supercell should be used, and so the matching bulk supercell for the " "generated defect supercell (also returned by this function) should be used for its reference " "host cell calculation.", ) # Two symmetry-equivalent supercells can have identical supercell lattice vectors but different # atomic positions, due to different tiling of primitive cells within the same total supercell -- # think e.g. parallelepids tiled along the long or short sides in a larger (supercell) # parallelepid; so e.g. the max distance between atoms _in the same cell_ can differ etc -- which # causes them to become symmetry-inequivalent upon periodicity-breaking (e.g. introduction of a # defect) # When these different primitive cells are tiled into supercells, the supercell shapes end up # identical but the atoms sit at different Cartesian positions. Crucially, the mapping between the # two is not a single rigid-body transformation (rotation + translation) applied uniformly to all # atoms. # if the original (defect entry) bulk supercell and target supercell differ in bond lengths, # then this can cause some irregularities in bonding within the output bulk supercell (as we # don't rescale when stenciling, so we take original Cartesian coordinates with the target # supercell lattice parameters) here we warn if the minimum bond length in the output bulk # supercell is less than 99% of the original bond length, _and_ if we have a bulk mismatch # warning (as otherwise the output bulk cell shouldn't be used anyway): new_bulk_min_bond_length = min_dist(oriented_new_bulk_supercell) if new_bulk_min_bond_length < bulk_min_bond_length * 0.995: warnings.warn( f"Note that the stenciled `bulk` supercell has a minimum interatomic distance of " f"{new_bulk_min_bond_length:.2f} Å, smaller than 99.5% of the original bond length " f"({bulk_min_bond_length:.2f} Å). This is typically the result of different bond lengths " f"/ interatomic distances between the original bulk and target supercells, which affects " f"the stenciling process. If the difference is relatively small, then this is easily " f"resolved by relaxing the output bulk supercell to re-equilibrate the interatomic " f"distances." ) assert ( get_defect_type_and_composition_diff( orig_supercell, orig_bulk_supercell, _parameter_order_warn=False )[1] == get_defect_type_and_composition_diff( oriented_new_defect_supercell, oriented_new_bulk_supercell, _parameter_order_warn=False )[1] ), ( f"Output composition ({oriented_new_defect_supercell.composition}) does not match expected " f"stenciled defect composition, indicating fatal issues with stenciling here!" ) return oriented_new_defect_supercell, oriented_new_bulk_supercell
def _scan_symm_ops_to_place_site_closest_to_frac_coords( symm_ops: Structure | Sequence[SymmOp], site: PeriodicSite, target_frac_coords: np.ndarray[float] | list[float] | None = None, ) -> SymmOp: """ Given either a list of symmetry operations or a structure (to extract symmetry operations from), scan over all symmetry operations to find the one which places the provided ``site`` closest to the target fractional coordinates. Args: symm_ops (|Structure| | Sequence[SymmOp]): Either a list of symmetry operations or a structure from which to extract symmetry operations. site (|PeriodicSite|): The site to place closest to the target fractional coordinates. target_frac_coords (np.ndarray[float] | list[float] | None): The target fractional coordinates to place the site closest to. Default is ``None``, in which case the site is placed closest to the centre of the supercell (i.e. [0.5, 0.5, 0.5]). """ if isinstance(symm_ops, Structure): sga = get_sga(symm_ops) symm_ops = sga.get_symmetry_operations() target_frac_coords = [0.5, 0.5, 0.5] if target_frac_coords is None else target_frac_coords # get transformed site for each possible ``symm_op`` symm_op_pos_dict = {} for i, symm_op in enumerate(symm_ops): # should check if frac or cartesian is faster symm_opped_site = apply_symm_op_to_site(symm_op, site, fractional=True, rotate_lattice=False) symm_opped_site.to_unit_cell(in_place=True) # faster with in_place symm_op_pos_dict[i] = symm_opped_site.frac_coords # get symm_op which puts defect closest to target_frac_coords: closest_site = min(symm_op_pos_dict.items(), key=lambda x: np.linalg.norm(x[1] - target_frac_coords)) return symm_ops[closest_site[0]]
[docs] def stencil_target_cell_from_big_cell( big_supercell: Structure, target_supercell: Structure, target_composition: Composition | None, edge_tol_range: float | range | list | np.ndarray | None = None, bulk_min_bond_length: float | None = None, min_dist_tol_factor_range: float | range | list | np.ndarray | None = None, min_dist_warning_tol_factor: float = 0.9, pbar: tqdm | None = None, ) -> Structure: """ Given the input ``big_supercell`` and ``target_supercell`` (which should be fully encompassed by the former), stencil out the sites in ``big_supercell`` which correspond to the sites in ``target_supercell`` (i.e. are within the Cartesian bounds of ``target_supercell``). Note that this function assumes that the defect is roughly centred within ``big_supercell`` (i.e. near [0.5, 0.5, 0.5])! The midpoints of ``target_supercell`` and ``big_supercell`` are then aligned within this function, before stenciling. We need to ensure the appropriate number of sites (and their composition) are taken, and that the sites we choose are appropriate for the new supercell (i.e. that if we have e.g. an in-plane contraction, we don't take duplicate atoms that then correspond to tiny inter-atomic distances in the new supercell due to imperfect stenciling under PBC -- so we can't simply take the atoms that are closest to the defect). So, here we scan over possible choices of atoms to include, and take the combination which maximises the `minimum` inter-atomic distance in the new supercell, when accounting for PBCs. Note that differences between the lattice parameters (volumes per atom) of ``big_supercell`` and ``target_supercell`` can cause issues with the stenciling approach herein, particularly when ``target_composition`` is not supplied and/or for defective ``big_supercell`` cells. Args: big_supercell (|Structure|): The supercell structure which fully encompasses ``target_supercell``, from which to stencil out the sites. target_supercell (|Structure|): The supercell structure giving the cell dimensions to stencil out from ``big_supercell``. target_composition (|Composition| | None): Expected composition of the output stenciled cell (used to determine candidate sites to stencil out). Auto-determined by comparing ``big_supercell`` and ``target_supercell`` if ``None`` (default). edge_tol_range (float | range | list | np.ndarray | None): A range or list of tolerances (in Angstrom) for site displacements at the edge of the stenciled supercell to scan over, when determining the best match of sites to stencil out in the new supercell (of ``target_supercell`` dimension). Default is ``None``, in which case the default range is used (0.2 to 4 Å in 0.2 Å increments). In the stenciling search, we scan over ``edge_tol_range`` first, before iterating over ``min_dist_tol_factor_range``. Once a stenciling solution which satisfies the ``min_dist_tol_factor`` tolerance is found for a given ``edge_tol``, the search is terminated. bulk_min_bond_length (float): The minimum interatomic distance in the bulk supercell. Default is ``None``, in which case it is calculated from ``target_supercell``. min_dist_tol_factor_range (float | range | list | np.ndarray | None): A range or list of tolerance factors for the minimum interatomic distance (relative to the ``bulk_min_bond_length``) at the edge region of the stenciled defect supercell to scan over, when determining the best match of sites to stencil out in the new supercell. Default is ``None``, in which case the default range is used (0.95 to 0.5 in 0.05 increments). In the stenciling search, we scan over ``edge_tol_range`` first, before iterating over ``min_dist_tol_factor_range``. Once a stenciling solution which satisfies the ``min_dist_tol_factor`` tolerance is found for a given ``edge_tol``, the search is terminated. min_dist_warning_tol_factor (float): A tolerance factor for the minimum interatomic distance (relative to the ``bulk_min_bond_length``) in the stenciled defect supercell to use as a warning threshold. If the minimum interatomic distance near the edge of the stenciled defect supercell is less than this threshold, a warning is issued. Default is 0.9 (i.e. 90% of the ``bulk_min_bond_length``). pbar (tqdm): ``tqdm`` progress bar object to update (for internal ``doped`` usage). Default is ``None``. Returns: Structure: The stenciled supercell structure. """ edge_tol_range = _ensure_list(edge_tol_range if edge_tol_range is not None else np.arange(0.2, 4, 0.2)) min_dist_tol_factor_range = _ensure_list( min_dist_tol_factor_range if min_dist_tol_factor_range is not None else np.arange(0.95, 0.5, -0.05) ) assert isinstance(edge_tol_range, list | np.ndarray) assert isinstance(min_dist_tol_factor_range, list | np.ndarray) # first, translate sites to put the centre of big_supercell (which should have defect site at the # centre) to the centre of the target supercell (w/cart coords) target_supercell_midpoint_cart_coords = target_supercell.lattice.get_cartesian_coords([0.5, 0.5, 0.5]) translation_from_big_defect_middle_to_target_middle = ( target_supercell_midpoint_cart_coords - big_supercell.lattice.get_cartesian_coords([0.5, 0.5, 0.5]) ) big_supercell = translate_structure( big_supercell, translation_from_big_defect_middle_to_target_middle, frac_coords=False, to_unit_cell=False, ) if bulk_min_bond_length is None: bulk_min_bond_length = min_dist(target_supercell) if target_composition is None: # get target composition accounting for defect presence in big supercell: num_sc = big_supercell.volume / target_supercell.volume # may be fractional big_supercell_comp_wout_X = big_supercell.copy().remove_species(["X"]).composition target_composition = big_supercell_comp_wout_X - ((num_sc - 1) * target_supercell.composition) target_composition = Composition({k: round(v) for k, v in target_composition.items()}) # here we stencil out sites from the big supercell using the ``target_supercell`` cell (i.e. stencil) # for the bulk supercell this is trivial, as we can just take the ``target_supercell`` box as is, and # it will have the correct composition (assuming the unit cell is the same in both) # for defective supercells (or slightly different bulk supercells, which are also handled natively # here), there can be noise/displacements in the atomic positions, which may be anisotropic etc, such # that the ``target_supercell`` stencil may miss some sites or include additional ones. # Thus, we need to be smart and account for positional noise in this stenciling approach. # To do this, we use an 'edge tolerance' with the ``_get_candidate_supercell_sites`` function to # section out the big supercell sites into: # - ``def_new_supercell_sites``: Super-supercell sites within ``target_supercell`` minus ``edge_tol`` # - ``def_new_supercell_sites_to_check``: Sites in ``def_new_supercell_sites`` within # ``[max(bulk_min_bond_length, edge_tol*2), edge_tol]`` from the target cell edge, used to check for # site overlaps/displacements. # - ``candidate_new_supercell_sites``: Candidate sites for the new supercell, within +/-``edge_tol`` of # the target cell edge, used to determine which sites to include in new supercell, by checking site # overlaps/displacements (combined with ``def_new_supercell_sites_to_check``) and target composition. # We then need to determine the subset of ``candidate_new_supercell_sites`` which (1; easy) matches the # correct composition for our new target supercell, and (2; harder) which minimises discontinuities at # the supercell boundary (i.e. avoiding (nearly-)overlapping sites, which could occur due to # periodically-repeated atomic displacements combined with stenciling of a different supercell # shape/size). For (2), one could try to be very smart and track the atomic displacements in the defect # supercell as a function of distance to defect, or even for each specific site, keep track in the big # supercell, and try determine when the best match has been obtained by comparing max site displacement # near the edge to the known bulk-vs-defect displacement at this range, but would be tricky to # implement, likely not much (if at all) better (as different combinations of supercell shapes will # give different strain fields, such that the best choice of atoms near the stencil boundaries is # not necessarily immediately obvious) and in practice some basic displacement analysis should suffice. # Here for each tested ``edge_tol``, we take the set of candidate/variable sites, remove any which are # overlapping (within 50% of the bulk bond length / min distance; see ``_remove_overlapping_sites``), # from these take the site combinations which match the target composition, determine that which # minimises structural noise by giving the largest minimum interatomic distance in the near-edge # region. If the min interatomic distance here is greater than # ``bulk_min_bond_length * min_dist_tol_factor`` then this is treated as an acceptable solution # iterate over edge_tol first, then min_dist_tol_factor: for min_dist_tol_factor, edge_tol in product(min_dist_tol_factor_range, edge_tol_range): try: ( def_new_supercell_sites, def_new_supercell_sites_to_check, candidate_new_supercell_sites, ) = _get_candidate_supercell_sites( big_supercell, target_supercell, edge_tol, bulk_min_bond_length ) # Determine the composition of sites required to add to ``def_new_supercell_sites`` and match # the target composition: def_new_supercell_sites_struct = Structure.from_sites(def_new_supercell_sites) def_new_supercell_sites_struct.remove_species("X") # def_new_supercell_sites also has X in it def_new_supercell_sites_comp = def_new_supercell_sites_struct.composition target_candidate_composition = ( target_composition - def_new_supercell_sites_comp ) # composition to add num_sites_up_for_grabs = int( sum(target_candidate_composition.values()) ) # number of sites to be placed candidate_new_supercell_sites = _remove_overlapping_sites( candidate_new_supercell_sites=candidate_new_supercell_sites, def_new_supercell_sites_to_check=def_new_supercell_sites_to_check, bulk_min_bond_length=bulk_min_bond_length, pbar=pbar, ) if len(candidate_new_supercell_sites) < num_sites_up_for_grabs: raise RuntimeError( f"Too little candidate sites ({len(candidate_new_supercell_sites)}) to match target " f"composition ({num_sites_up_for_grabs} sites to be placed). Aborting." ) num_combos = math.comb(len(candidate_new_supercell_sites), num_sites_up_for_grabs) if num_combos > 1e10: raise RuntimeError( "Far too many possible site combinations to check, indicating a code failure. " "Aborting, please report this case to the developers!" ) if pbar is not None: pbar.set_description( f"Calculating best match (edge_tol = {edge_tol} Å, possible combos = {num_combos})" ) # 40% of pbar species_symbols = [site.specie.symbol for site in candidate_new_supercell_sites] min_interatomic_distances_tuple_combo_dict = {} idx_combos = list( combinations(range(len(candidate_new_supercell_sites)), num_sites_up_for_grabs) ) if idx_combos and idx_combos != [()]: for idx_combo in idx_combos: if _Composition__eq__( _cached_Composition_init( Hashabledict(Counter([species_symbols[i] for i in idx_combo])) ), target_candidate_composition, ): # could early break cases where the distances are too small? if a bottleneck, # currently not (supercell re-orientation is) -- likely to be a bottleneck with # the latest approach... # And/or could loop over subsets of each combo first, culling any which break this idx_combo_sites_in_target = [candidate_new_supercell_sites[i] for i in idx_combo] fake_candidate_struct_sites = ( def_new_supercell_sites_to_check + idx_combo_sites_in_target ) frac_coords = [site.frac_coords for site in fake_candidate_struct_sites] distance_matrix = get_distance_matrix(frac_coords, target_supercell.lattice) sorted_distances = np.sort(distance_matrix.flatten()) min_interatomic_distances_tuple_combo_dict[ tuple(sorted_distances[len(frac_coords) : len(frac_coords) + 10]) # take the 10 smallest distances for comparison purposes ] = idx_combo_sites_in_target # get the candidate structure with the largest minimum interatomic distance: min_interatomic_distances_tuple_combo_list = sorted( min_interatomic_distances_tuple_combo_dict.items(), key=lambda x: x[0], reverse=True, ) if not min_interatomic_distances_tuple_combo_list: raise RuntimeError("No matching compositions for stenciling found.") # raise RuntimeError and dynamically increase ``edge_tol`` (first, then decrease # ``min_dist_tol_factor``) if resulting min_dist is too small: idx_combo_min_dist = min_interatomic_distances_tuple_combo_list[0][0][0] if idx_combo_min_dist < (bulk_min_bond_length * min_dist_tol_factor): raise RuntimeError( f"Minimum interatomic distance ({idx_combo_min_dist:.2f} Å) near the edge (within " f"{edge_tol:.2f} Å) of the target cell is less than the minimum distance " f"tolerance ({bulk_min_bond_length * min_dist_tol_factor:.2f} Å), indicating a " f"fatal issue with the stenciling process. Aborting." ) new_supercell_sites = def_new_supercell_sites + list( min_interatomic_distances_tuple_combo_list[0][1] ) min_dist_warning_tol = bulk_min_bond_length * min_dist_warning_tol_factor if idx_combo_min_dist < min_dist_warning_tol: inner_range = max(bulk_min_bond_length, edge_tol * 2) warnings.warn( f"Note that the generated stenciled structure has a minimum interatomic distance " f"of {idx_combo_min_dist:.2f} Å near the cell edge (within {inner_range:.2f} Å), " f"smaller than the warning threshold ({min_dist_warning_tol_factor} of the bulk " f"minimum interatomic distance ({bulk_min_bond_length:.2f} Å) = " f"{min_dist_warning_tol:.2f} Å). Some remnant structural noise is of course " f"expected when stenciling with relatively small original/target supercells, so " f"consider if this is reasonable for your system!" ) else: new_supercell_sites = def_new_supercell_sites new_supercell = Structure( lattice=target_supercell.lattice, species=[site.specie for site in new_supercell_sites], coords=[site.coords for site in new_supercell_sites], coords_are_cartesian=True, to_unit_cell=True, ) break except (RuntimeError, ValueError) as e: if edge_tol == edge_tol_range[-1] and min_dist_tol_factor == min_dist_tol_factor_range[-1]: raise e # end of the line and no matches; raise error if pbar is not None: pbar.n = 20 # decrease pbar progress back to 20% pbar.refresh() pbar.set_description( f"Trying edge_tol = {edge_tol:.2f} Å, min_dist_tol_factor = {min_dist_tol_factor:.2f}" ) continue return new_supercell
def _orient_to_match_target( structure: Structure, target_structure: Structure, transformation: tuple ) -> Structure: return apply_s2_to_s1_transformation( target_structure, structure, supercell_matrix=transformation[0], trans_vector=transformation[1], mapping=transformation[2], # new_lattice="struct1", # struct1 by default, no warning if not possible ignored_species=["X"], # ignore X site ) def _get_candidate_supercell_sites( big_supercell: Structure, target_supercell: Structure, edge_tol: float = 0.2, bulk_min_bond_length: float | None = None, ) -> tuple[list[PeriodicSite], list[PeriodicSite], list[PeriodicSite]]: """ Get all atoms in ``big_supercell`` which are within the Cartesian bounds of the ``target_supercell`` supercell. We need to ensure the same number of sites, and that the sites we choose are appropriate for the new supercell (i.e. that if we have e.g. an in- plane contraction, we don't take duplicate atoms that then correspond to tiny inter-atomic distances in the new supercell due to imperfect stenciling under PBC -- so we can't just take the atoms that are closest to the defect). So, this function determines the possible sites to include in the new supercell, the sites which are definitely in the target supercell, and the sites which are near the bordering regions of the target supercell and so may or may not be included (i.e. ``candidate_new_supercell_sites``), using the provided ``edge_tol``. Note that this function assumes that the defect (or any significant atomic displacements) is (are) roughly centred within ``big_supercell`` (i.e. near ``[0.5, 0.5, 0.5]``)! Args: big_supercell (|Structure|): The super-supercell with a single defect supercell and rest of the sites populated by the bulk supercell. target_supercell (|Structure|): The supercell structure to re-generate the relaxed defect structure in. edge_tol (float): A tolerance (in Angstrom) for site displacements at the edge of the ``target_supercell`` supercell, when determining the best match of sites to stencil out in the new supercell. Default is 0.2 Angstrom. bulk_min_bond_length (float): The minimum interatomic distance in the bulk supercell. Default is ``None``, in which case it is calculated from ``target_supercell``. Returns: tuple[list[PeriodicSite], list[PeriodicSite], list[PeriodicSite], Composition, int]: - ``def_new_supercell_sites``: List of sites in the super-supercell which are within the bounds of the target supercell (minus ``edge_tol``) -- 'definite sites in the new supercell'. - ``def_new_supercell_sites_to_check``: List of sites in ``def_new_supercell_sites`` which are near the bordering regions of the target supercell, to check for site overlaps/displacements. Specifically, includes sites within -``[max(bulk_min_bond_length, edge_tol*2), edge_tol]`` of the target cell edges). - ``candidate_new_supercell_sites``: List of candidate sites for the new (target) supercell, within +/-``edge_tol`` of the target cell edge, used to determine which sites to include in the new supercell, by checking site overlaps/displacements (combined with ``def_new_supercell_sites_to_check``) and target composition. """ if bulk_min_bond_length is None: bulk_min_bond_length = min_dist(target_supercell) # Note: This could possibly be made faster by reducing `edge_tol` when the `orig_supercell` is # fully encompassed by `target_supercell` (meaning there should be no defect-induced # displacements at the stenciled cell edges here), by getting the minimum encompassing cube # length (sphere diameter), and seeing if this is smaller than the largest sphere (diameter) # which can be inscribed in the target supercell # either way, this portion of the workflow is not a bottleneck (rather `get_orientation..` is) possible_new_supercell_sites = [ PeriodicSite( site.species, site.coords, lattice=target_supercell.lattice, coords_are_cartesian=True, skip_checks=True, ) for site in big_supercell.sites if is_within_frac_bounds(target_supercell.lattice, site.coords, tol=edge_tol) ] def_new_supercell_site_indices = [] for i, site in enumerate(possible_new_supercell_sites): if is_within_frac_bounds(target_supercell.lattice, site.coords, tol=-edge_tol): def_new_supercell_site_indices.append(i) def_new_supercell_sites = [ site for i, site in enumerate(possible_new_supercell_sites) if i in def_new_supercell_site_indices ] candidate_new_supercell_sites = [ site for i, site in enumerate(possible_new_supercell_sites) if i not in def_new_supercell_site_indices ] def_new_supercell_sites_to_check = [ site for site in def_new_supercell_sites if not is_within_frac_bounds( target_supercell.lattice, site.coords, tol=-max(bulk_min_bond_length, edge_tol * 2) ) ] return ( def_new_supercell_sites, def_new_supercell_sites_to_check, candidate_new_supercell_sites, ) def _remove_overlapping_sites( candidate_new_supercell_sites: list[PeriodicSite], def_new_supercell_sites_to_check: list[PeriodicSite], bulk_min_bond_length: float | None = None, pbar: tqdm | None = None, ) -> list[PeriodicSite]: """ Remove sites in ``candidate_new_supercell_sites`` which overlap (within 50% of the bulk bond length) either with each other (in which case the site which is furthest from any other site in ``candidate_new_supercell_sites`` or ``def_new_supercell_sites_to_check`` is kept) or with sites in ``def_new_supercell_sites_to_check``. Args: candidate_new_supercell_sites (list[|PeriodicSite|]): List of candidate sites in the target supercell to check if overlapping with each other or ``def_new_supercell_sites_to_check``. def_new_supercell_sites_to_check (list[|PeriodicSite|]): List of sites that are in the new (target) supercell but are near the bordering regions, so are used to check for overlapping. bulk_min_bond_length (float): The minimum bond length in the bulk supercell, used to check if inter-site distances are reasonable. If ``None`` (default), determined automatically from ``def_new_supercell_sites_to_check``. pbar (tqdm): ``tqdm`` progress bar object to update (for internal ``doped`` usage). Default is ``None``. Returns: list[PeriodicSite]: The list of candidate sites in the target supercell which do not overlap with each other or with ``def_new_supercell_sites_to_check``. """ if not candidate_new_supercell_sites: if pbar is not None: pbar.update(20) return candidate_new_supercell_sites if bulk_min_bond_length is None: bulk_min_bond_length = min_dist(Structure.from_sites(def_new_supercell_sites_to_check)) check_other_candidate_sites_first = len(candidate_new_supercell_sites) < len( def_new_supercell_sites_to_check ) # check smaller list first for efficiency overlapping_site_indices: set[int] = set() # using indices as faster for comparing than actual sites _pbar_increment_per_iter = max( 0, 20 / len(candidate_new_supercell_sites) - 0.0001 ) # up to 20% of progress bar # pre-compute arrays for vectorised distance checks: lattice = candidate_new_supercell_sites[0].lattice # same lattice for all sites n = len(candidate_new_supercell_sites) cand_frac_coords = np.array([s.frac_coords for s in candidate_new_supercell_sites]) cand_species = [s.specie.symbol for s in candidate_new_supercell_sites] threshold = bulk_min_bond_length * 0.5 # NxN candidate-candidate distance matrix (diagonal set to inf to ignore self-distances): cand_cand_dists = lattice.get_all_distances(cand_frac_coords, cand_frac_coords) np.fill_diagonal(cand_cand_dists, np.inf) # per-candidate minimum distance to any definite site (inf if no definite sites): def_frac_coords = np.array([s.frac_coords for s in def_new_supercell_sites_to_check]) min_cand_def_dists = ( np.min(lattice.get_all_distances(cand_frac_coords, def_frac_coords), axis=1) if def_frac_coords.size else np.full(n, np.inf) ) def _check_other_sites(idx): for other_idx in range(n): if ( idx == other_idx or other_idx in overlapping_site_indices or cand_species[idx] != cand_species[other_idx] ): continue if cand_cand_dists[idx, other_idx] < threshold: # remove the site closest to any other candidate/def site (excluding the overlapping pair) pair_mask = np.ones(n, dtype=bool) pair_mask[[idx, other_idx]] = False min_d_idx = min( np.min(cand_cand_dists[idx, pair_mask], initial=np.inf), min_cand_def_dists[idx] ) min_d_other = min( np.min(cand_cand_dists[other_idx, pair_mask], initial=np.inf), min_cand_def_dists[other_idx], ) overlapping_site_indices.add(idx if min_d_idx <= min_d_other else other_idx) for idx in range(n): if pbar is not None: pbar.update(_pbar_increment_per_iter) if idx in overlapping_site_indices: continue if check_other_candidate_sites_first: _check_other_sites(idx) if ( idx not in overlapping_site_indices and def_frac_coords.size and min_cand_def_dists[idx] < threshold ): overlapping_site_indices.add(idx) if idx not in overlapping_site_indices and not check_other_candidate_sites_first: _check_other_sites(idx) return [ site for i, site in enumerate(candidate_new_supercell_sites) if i not in overlapping_site_indices ] def _get_matching_sites_from_s1_then_s2( template_struct: Structure, struct1_pool: Structure, struct2_pool: Structure, ) -> Structure: """ Generate a stenciled structure from a template sub-set structure and two pools of sites/structures. Given two pools of sites/structures, returns a new structure which has (1) the closest site in ``struct1_pool`` to each site in ``template_struct``, and (2) ``struct2_pool.num_sites - template_struct.num_sites`` sites from ``struct2_pool``, chosen as those with the largest minimum distances from sites in the first set of sites (i.e. the single defect subcell matching ``template_struct``). The targeted use case is transforming a large periodically-repeated super- supercell of a defect supercell (``struct1_pool``) into the same super- supercell but with only one copy of the original defect supercell (``template_struct``), and the rest of the sites populated by the bulk super-supercell (``struct2_pool``; with ``struct2`` being the original bulk supercell). Args: template_struct (|Structure|): The template structure to match. struct1_pool (|Structure|): The first pool of sites to match to the template structure. struct2_pool (|Structure|): The second pool of sites to match to the template structure. Returns: Structure: The stenciled structure. """ num_super_supercells = len(struct1_pool) // len(template_struct) # both also have X single_defect_subcell_sites = [] # Uses linear assignment per species for unambiguous optimal matching, for template_struct sites mapping = get_site_mapping_indices(template_struct, struct1_pool, frac_coords=False, threshold=np.inf) pool1_indices = [ # mapping entries are (dist, template_idx, pool_idx) pool_idx for _, template_idx, pool_idx in mapping if (template_idx is not None and pool_idx is not None) ] assert len(set(pool1_indices)) == len(pool1_indices) # check no duplicate indices single_defect_subcell_sites = [struct1_pool[i] for i in pool1_indices] # for struct2_pool, it's not as straightforward to use ``get_site_mapping_indices``, as we are trying # to compare defect-containing supercells to a bulk supercell, so have to account for different # compositions / number of sites etc; so more straightforward/robust to follow this approach, where we # get the required number of sites from ``struct2_pool``, which are furthest from those in # ``single_defect_subcell_sites``: species_coord_dict = {} # avoid recomputing coords for each site for element in struct2_pool.composition.elements: species_coord_dict[element.symbol] = get_coords_and_idx_of_species( single_defect_subcell_sites, element.symbol, frac_coords=True, # frac_coords )[0] struct2_pool_dists_to_template_centre = struct2_pool.lattice.get_all_distances( struct2_pool.frac_coords, struct2_pool.lattice.get_fractional_coords( template_struct.lattice.get_cartesian_coords([0.5, 0.5, 0.5]) ), ).ravel() # template centre is defect site in stenciling workflow largest_encompassed_cube_length = _largest_cube_length_from_matrix(template_struct.lattice.matrix) candidate_struct2_pool_species_sites: dict[str, list[PeriodicSite]] = { super_site.specie.symbol: [] for super_site in struct2_pool } for dist_to_template_centre, super_site in zip( struct2_pool_dists_to_template_centre, struct2_pool, strict=False ): # screen to sites outside defect WS radius, for efficiency: if dist_to_template_centre > largest_encompassed_cube_length * 0.49: # 2% buffer (cube length / 2) candidate_struct2_pool_species_sites[super_site.specie.symbol].append(super_site) struct2_pool_site_min_dist_dict = {} for species_symbol, species_sites in candidate_struct2_pool_species_sites.items(): dist_matrix = struct2_pool.lattice.get_all_distances( # vectorised for fast computation species_coord_dict[species_symbol], [site.frac_coords for site in species_sites] ) # M x N min_dists = np.min(dist_matrix, axis=0) # down columns struct2_pool_site_min_dist_dict.update(dict(zip(species_sites, min_dists, strict=False))) # sort possible_bulk_outer_cell_sites by (largest) min dist to single_defect_subcell_sites: possible_bulk_outer_cell_sites = sorted( [site for sites in candidate_struct2_pool_species_sites.values() for site in sites], key=lambda x: struct2_pool_site_min_dist_dict[x], reverse=True, ) bulk_outer_cell_sites = possible_bulk_outer_cell_sites[ : len(struct2_pool) - len(struct2_pool) // num_super_supercells ] return Structure.from_sites(single_defect_subcell_sites + bulk_outer_cell_sites) def _get_superset_matrix_and_supercells( structure: Structure, target_supercell: Structure, ) -> np.ndarray[int]: """ Given a structure and a target supercell, return the transformation ('superset') matrix which makes all lattice vectors for the structure larger than or equal to the largest lattice vector in ``target_supercell``. Args: structure (|Structure|): The original structure for which to get the superset matrix that fully encompasses the target supercell. target_supercell (|Structure|): The target supercell. Returns: superset_matrix (np.ndarray[int]): Transformation matrix to make all lattice vectors for ``structure`` larger than or equal to the largest lattice vector in ``target_supercell``. """ min_cell_length = _get_all_encompassing_cube_length(target_supercell.lattice) # get supercell matrix which makes all lattice vectors for orig_supercell larger than min_cell_length: return np.ceil(min_cell_length / structure.lattice.abc) # could possibly use non-diagonal supercells to make this more efficient in some cases, but a lot of # work, shouldn't really contribute much to slowdowns, and only relevant in some rare cases (?) def _get_all_encompassing_cube_length(lattice: Lattice) -> float: """ Get the smallest possible cube that fully encompasses the cell, `regardless of orientation`. This is determined by getting the 8 vertices of the cell and computing the max distance between any two vertices, giving the side length of an all- encompassing cube. Equivalent to getting the diameter of an encompassing sphere. Args: lattice (|Lattice|): The lattice to get the all-encompassing cube length for. Returns: float: The side length of the all-encompassing cube. """ # get all vertices of the cell: cart_vertices = np.array([lattice.get_cartesian_coords(perm) for perm in product([0, 1], repeat=3)]) # get 2D matrix of all distances between vertices: distances = np.linalg.norm(cart_vertices[:, None] - cart_vertices, axis=-1) return max(distances.flatten())
[docs] def is_within_frac_bounds( lattice: Lattice, cart_coords: np.ndarray[float] | list[float], tol: float = 1e-5 ) -> bool: """ Check if a given Cartesian coordinate is inside the unit cell defined by the lattice object. Args: lattice (|Lattice|): |Lattice| object defining the unit cell. cart_coords (np.ndarray[float] | list[float]): The Cartesian coordinates to check. tol (float): A tolerance (in Angstrom / cartesian units) for coordinates to be considered within the unit cell. If positive, expands the bounds of the unit cell by this amount, if negative, shrinks the bounds. Returns: bool: Whether the Cartesian coordinates are within the fractional bounds of the unit cell, accounting for ``tol``. """ frac_coords = lattice.get_fractional_coords(cart_coords) frac_tols = np.array([tol, tol, tol]) / lattice.abc # Check if fractional coordinates are in the range [0, 1) return np.all((frac_coords + frac_tols >= 0) & (frac_coords - frac_tols < 1))
def _convert_defect_neighbours_to_X( defect_supercell: Structure, defect_position: np.ndarray[float], coords_are_cartesian: bool = False, ws_radius_fraction: float = 0.5, ) -> Structure: """ Convert all neighbouring sites of a defect site in a supercell, within ``ws_radius_fraction`` of the Wigner-Seitz radius, to have their species as "X" (storing their original species in the site property dict under the "orig_species" key). Intended to then be used to make structure-matching far more efficient, by ignoring the highly-perturbed defect neighbourhood (which requires larger ``stol`` values which greatly slow down structure-matching). No longer used in default stenciling approach. Args: defect_supercell (|Structure|): The defect supercell to edit. defect_position (np.ndarray[float]): The coordinates of the defect site, either fractional or Cartesian depending on ``coords_are_cartesian``. coords_are_cartesian (bool): Whether the defect position is in Cartesian coordinates. Default is ``False`` (fractional coordinates). ws_radius_fraction (float): The fraction of the Wigner-Seitz radius to use as the cut-off distance for neighbouring sites to convert to species "X". Default is 0.5 (50%). Returns: Structure: The supercell structure with the defect site and all neighbouring sites converted to species X. """ converted_defect_supercell = defect_supercell.copy() ws_radius = get_wigner_seitz_radius(defect_supercell) defect_frac_coords = ( defect_position if not coords_are_cartesian else defect_supercell.lattice.get_fractional_coords(defect_position) ) site_indices_to_convert = np.where( # vectorised for fast computation defect_supercell.lattice.get_all_distances( defect_supercell.frac_coords, defect_frac_coords ).ravel() < np.max((ws_radius * ws_radius_fraction, 1)) )[0] for i, orig_site in enumerate(defect_supercell): if i in site_indices_to_convert: converted_defect_supercell.replace(i, "X") # we set properties for all sites, because ``Structure.from_sites()`` in ``pymatgen``'s # |StructureMatcher| adds properties to all sites, which can then mess with site comparisons... converted_defect_supercell[i].properties["orig_species"] = orig_site.specie.symbol return converted_defect_supercell def _convert_X_back_to_orig_species(converted_defect_supercell: Structure) -> Structure: """ Convert all sites in a supercell with species "X" and "orig_species" in the site property dict back to their original species. Mainly intended just for internal ``doped`` usage, to convert back sites which had been converted to "X" for efficient structure-matching (see ``_convert_defect_neighbours_to_X``). No longer used in default stenciling approach. Args: converted_defect_supercell (|Structure|): The defect supercell to convert back. Returns: Structure: The supercell structure with all sites with species "X" and "orig_species" in the site property dict converted back to their original species. """ defect_supercell = converted_defect_supercell.copy() for i, site in enumerate(defect_supercell): if site.specie.symbol == "X" and site.properties.get("orig_species"): defect_supercell.replace(i, site.properties.pop("orig_species", site.specie.symbol)) return defect_supercell