Source code for doped.utils.stenciling

"""
Utility functions to re-generate a relaxed defect structure in a different
supercell.
"""

import math
import warnings
from collections import Counter
from collections.abc import Sequence
from itertools import combinations, product
from typing import Optional, Union

import numpy as np
from pymatgen.core.sites import PeriodicSite
from pymatgen.core.structure import Composition, Lattice, Structure
from tqdm import tqdm

from doped.core import DefectEntry
from doped.utils.configurations import orient_s2_like_s1
from doped.utils.efficiency import (
    Hashabledict,
    _cached_Composition_init,
    _Composition__eq__,
    _fast_get_composition_from_sites,
)
from doped.utils.parsing import (
    _get_bulk_supercell,
    _get_defect_supercell,
    check_atom_mapping_far_from_defect,
    get_coords_and_idx_of_species,
    get_defect_type_and_composition_diff,
    get_wigner_seitz_radius,
)
from doped.utils.supercells import _largest_cube_length_from_matrix, min_dist
from doped.utils.symmetry import (
    SymmOp,
    apply_symm_op_to_site,
    apply_symm_op_to_struct,
    get_clean_structure,
    get_sga,
    translate_structure,
)


[docs] def get_defect_in_supercell( defect_entry: DefectEntry, target_supercell: Structure, check_bulk: bool = True, target_frac_coords: Union[np.ndarray[float], list[float], bool] = True, edge_tol: float = 1, ) -> tuple[Structure, Structure]: """ Re-generate a relaxed defect structure in a different supercell. This function takes the relaxed defect structure of the input ``DefectEntry`` (from ``DefectEntry.defect_supercell``) and re-generates it in the ``target_supercell`` structure, and the closest possible position to ``target_frac_coords`` (if provided, else closest to centre = [0.5, 0.5, 0.5]), also providing the corresponding bulk supercell (which should be the same for each generated defect supercell given the same ``target_supercell`` and base supercell for ``defect_entry``, see note below). ``target_supercell`` should be the same host crystal structure, just with different supercell dimensions, having the same lattice parameters and bond lengths. Note: This function does _not_ guarantee that the generated defect supercell atomic position basis exactly matches that of ``target_supercell``, which may have come from a different primitive structure definition (e.g. CdTe with ``{"Cd": [0,0,0], "Te": [0.25,0.25,0.25]}`` vs ``{"Cd": [0,0,0], "Te": [0.75,0.75,0.75]}``). The generated supercell _will_ have the exact same lattice/cell definition with fully symmetry-equivalent atom positions, but if the actual position basis differs then this can cause issues with parsing finite-size corrections (which rely on site-matched potentials). This is perfectly fine if it occurs, just will require the use of a matching bulk/reference supercell when parsing (rather than the input ``target_supercell``) -- ``doped`` will also throw a warning about this when parsing if a non-matching bulk supercell is used anyway. This function will automatically check if the position basis in the generated supercell differs from that of ``target_supercell``, printing a warning if so (unless ``check_bulk`` is ``False``) and returning the corresponding bulk supercell which should be used for parsing defect calculations with the generated supercell. Of course, if generating multiple defects in the same ``target_supercell``, only one such bulk supercell calculation should be required (should correspond to the same bulk supercell in each case). Briefly, this function works by: - Translating the defect site to the centre of the original supercell. - Identifying a super-supercell which fully encompasses the target supercell (regardless of orientation). - Generate this super-supercell, using one copy of the original defect supercell (``DefectEntry.defect_supercell``), and the rest of the sites (outside of the original defect supercell box, with the defect translated to the centre) are populated using the bulk supercell (``DefectEntry.bulk_supercell``). - Translate the defect site in this super-supercell to the Cartesian coordinates of the centre of ``target_supercell``, then stencil out all sites in the ``target_supercell`` portion of the super-supercell, accounting for possible site displacements in the relaxed defect supercell (e.g. if ``target_supercell`` has a different shape and does not fully encompass the original defect supercell). This is done by scanning over possible combinations of sites near the boundary regions of the ``target_supercell`` portion, and identifying the combination which maximises the minimum inter-atomic distance in the new supercell (i.e. the most bulk-like arrangement). - Re-orient this new stenciled supercell to match the orientation and site positions of ``target_supercell``. - If ``target_frac_coords`` is not ``False``, scan over all symmetry operations of ``target_supercell`` and apply that which places the defect site closest to ``target_frac_coords``. Args: defect_entry (DefectEntry): A ``DefectEntry`` object for which to re-generate the relaxed structure (taken from ``DefectEntry.defect_supercell``) in the ``target_supercell`` lattice. target_supercell (Structure): The supercell structure to re-generate the relaxed defect structure in. check_bulk (bool): Whether to check if the generated defect/bulk supercells have different atomic position bases to ``target_supercell`` (as described above) -- if so, a warning will be printed (unless ``check_bulk`` is ``False``). Default is ``True``. target_frac_coords (Union[np.ndarray[float], list[float], bool]): The fractional coordinates to target for defect placement in the new supercell. If just set to ``True`` (default), will try to place the defect nearest to the centre of the superset cell (i.e. ``target_frac_coords = [0.5, 0.5, 0.5]``), as is default in ``doped`` defect generation. Note that defect placement is harder in this case than in generation with ``DefectsGenerator``, as we are not starting from primitive cells and we are working with relaxed geometries. edge_tol (float): A tolerance (in Angstrom) for site displacements at the edge of the stenciled supercell, when determining the best match of sites to stencil out in the new supercell (of ``target_supercell`` dimension). Default is 1 Angstrom, and then this is sequentially increased up to 4.5 Angstrom if the initial scan fails. Returns: tuple[Structure, Structure]: The re-generated defect supercell in the ``target_supercell`` lattice, and the corresponding bulk/reference supercell for the generated defect supercell (see explanations above). """ # Note to self; using Pycharm breakpoints throughout is likely easiest way to debug these functions # TODO: Tests!! (At least one of each defect, Se good test case, then at least one or two with # >unary compositions and extrinsic substitution/interstitial) # TODO: We should now be able to use these functions (without the final re-orientation step, # for speed) to determine the point symmetries of relaxed defects in non-symmetry-conserving # supercells, by stenciling into a small symmetry-conserving cell and getting the point symmetry # for that -- will do! pbar = tqdm( total=100, bar_format="{desc}{percentage:.1f}%|{bar}| [{elapsed}, {rate_fmt}{postfix}]" ) # tqdm progress bar. 100% is completion pbar.set_description("Getting super-supercell (relaxed defect + bulk sites)") bulk_mismatch_warning = False try: orig_supercell = _get_defect_supercell(defect_entry) orig_min_dist = min_dist(orig_supercell) orig_bulk_supercell = _get_bulk_supercell(defect_entry) orig_defect_frac_coords = defect_entry.sc_defect_frac_coords target_supercell = target_supercell.copy() bulk_min_bond_length = min_dist(orig_bulk_supercell) target_frac_coords = [0.5, 0.5, 0.5] if target_frac_coords is True else target_frac_coords # ensure no oxidation states (for easy composition matching later) for struct in [orig_supercell, orig_bulk_supercell, target_supercell]: struct.remove_oxidation_states() # first translate both orig supercells to put defect in the middle, to aid initial stenciling: orig_def_to_centre = np.array([0.5, 0.5, 0.5]) - orig_defect_frac_coords orig_supercell = translate_structure(orig_supercell, orig_def_to_centre, frac_coords=True) trans_orig_bulk_supercell = translate_structure( orig_bulk_supercell, orig_def_to_centre, frac_coords=True ) # get big_supercell, which is expanded version of orig_supercell so that _each_ lattice vector is # now bigger than the _largest_ lattice vector in target_supercell (so that target_supercell is # fully encompassed by big_supercell): superset_matrix, big_supercell, big_supercell_with_X, orig_supercell_with_X = ( # supa-set!! _get_superset_matrix_and_supercells(orig_supercell, target_supercell, [0.5, 0.5, 0.5]) ) big_bulk_supercell = orig_bulk_supercell * superset_matrix # get big bulk supercell # this big_supercell is with the defect now repeated in it, but we want it with just one defect, # so only take the first repeated defect supercell, and then the rest of the sites from the # expanded bulk supercell: # only keep atoms in big supercell which are within the original supercell bounds: big_defect_supercell = _get_matching_sites_from_s1_then_s2( orig_supercell_with_X, big_supercell_with_X, trans_orig_bulk_supercell * superset_matrix, orig_min_dist, ) # translate structure to put defect at the centre of the big supercell (w/frac_coords) big_supercell_defect_site = next(s for s in big_defect_supercell.sites if s.specie.symbol == "X") def_to_centre = np.array([0.5, 0.5, 0.5]) - big_supercell_defect_site.frac_coords big_defect_supercell = translate_structure(big_defect_supercell, def_to_centre, frac_coords=True) pbar.update(20) # 20% of progress bar pbar.set_description("Getting sites in border region") # get all atoms in big supercell within the cartesian bounds of the target_supercell supercell: new_defect_supercell = _stencil_target_cell_from_big_cell( big_defect_supercell, target_supercell, bulk_min_bond_length=bulk_min_bond_length, orig_min_dist=orig_min_dist, edge_tol=edge_tol, pbar=pbar, ) new_bulk_supercell = _stencil_target_cell_from_big_cell( big_bulk_supercell, target_supercell, bulk_min_bond_length=bulk_min_bond_length, orig_min_dist=bulk_min_bond_length, edge_tol=1e-3, pbar=None, ) # shouldn't need `edge_tol`, should be much faster than defect supercell stencil pbar.update(15) # 55% of progress bar pbar.set_description("Ensuring matching orientation w/target_supercell") # now we just do get s2 like s1 to get the orientation right: defect_site = next(site for site in new_defect_supercell if site.specie.symbol == "X") # Note: this function is typically the main bottleneck in this workflow. We have already # optimised the underlying ``StructureMatcher`` workflow in many ways (caching, # fast structure/site/composition comparisons, skipping comparison of defect neighbourhood to # reduce requisite ``stol`` etc; being many orders of magnitude faster than the base # ``pymatgen`` ``StructureMatcher``), however the ``_cart_dists()`` function call is # still quite expensive, especially with large structures with significant noise in the atomic # positions... new_defect_supercell_w_defect_neighbours_as_X = _convert_defect_neighbours_to_X( new_defect_supercell, defect_site.frac_coords, coords_are_cartesian=False ) with warnings.catch_warnings(): warnings.filterwarnings("ignore", message="Not all sites have property orig_species") # first we orient the generated _bulk_ supercell to match the ``target_supercell``, # to try ensure consistency in the generated supercells oriented_new_bulk_supercell = orient_s2_like_s1( # speed should be >= defect orienting target_supercell, new_bulk_supercell, verbose=False, allow_subset=True, ) # return new_defect_supercell, _round_struct_coords(oriented_new_bulk_supercell, # dist_precision=0.01, to_unit_cell=True) oriented_new_defect_supercell_w_defect_neighbours_as_X = orient_s2_like_s1( oriented_new_bulk_supercell, new_defect_supercell_w_defect_neighbours_as_X, verbose=False, ignored_species=["X"], # ignore X site allow_subset=True, # allow defect supercell composition to differ ) oriented_new_defect_supercell = _convert_X_back_to_orig_species( oriented_new_defect_supercell_w_defect_neighbours_as_X ) oriented_new_defect_site = next( # get defect site and remove X from sites oriented_new_defect_supercell.pop(i) for i, site in enumerate(oriented_new_defect_supercell.sites) if site.specie.symbol == "X" ) pbar.update(35) # 90% of progress bar if target_frac_coords is not False: pbar.set_description("Placing defect closest to target_frac_coords") target_symm_op = _scan_symm_ops_to_place_site_closest_to_frac_coords( target_supercell, oriented_new_defect_site, target_frac_coords ) oriented_new_defect_supercell = get_clean_structure( apply_symm_op_to_struct( # apply symm_op to structure target_symm_op, oriented_new_defect_supercell, fractional=True, rotate_lattice=False, ), ) # reordered inputs in updated doped if check_bulk: bulk_mismatch_warning = not check_atom_mapping_far_from_defect( target_supercell, oriented_new_bulk_supercell, oriented_new_defect_site.frac_coords, warning=False, ) oriented_new_bulk_supercell = get_clean_structure( apply_symm_op_to_struct( target_symm_op, oriented_new_bulk_supercell, fractional=True, rotate_lattice=False, ), ) pbar.update(pbar.total - pbar.n) # set to 100% of progress bar except Exception as e: pbar.close() raise e finally: pbar.close() if bulk_mismatch_warning: # print warning after closing pbar; cleaner warnings.warn( "Note that the atomic position basis of the generated defect/bulk supercell " "differs from that of the ``target_supercell``. This is likely fine, " "and just due to differences in (symmetry-equivalent) primitive cell definitions " "(e.g. {'Cd': [0,0,0], 'Te': [0.25,0.25,0.25]} vs " "{'Cd(': [0,0,0], 'Te': [0.75,0.75,0.75]}``) -- see ``get_defect_in_supercell`` " "docstring for more info. For accurate finite-size charge corrections when " "parsing, a matching bulk and defect supercell should be used, and so the " "matching bulk supercell for the generated defect supercell (also returned " "by this function) should be used for its reference host cell calculation.", ) _check_min_dist(oriented_new_defect_supercell, orig_min_dist) # check distances are reasonable _check_min_dist(oriented_new_bulk_supercell, bulk_min_bond_length) return oriented_new_defect_supercell, oriented_new_bulk_supercell
def _scan_symm_ops_to_place_site_closest_to_frac_coords( symm_ops: Union[Structure, Sequence[SymmOp]], site: PeriodicSite, target_frac_coords: Optional[Union[np.ndarray[float], list[float]]] = None, ) -> SymmOp: """ Given either a list of symmetry operations or a structure (to extract symmetry operations from), scan over all symmetry operations to find the one which places the provided ``site`` closest to the target fractional coordinates. Args: symm_ops (Union[Structure, Sequence[SymmOp]]): Either a list of symmetry operations or a structure from which to extract symmetry operations. site (PeriodicSite): The site to place closest to the target fractional coordinates. target_frac_coords (Optional[Union[np.ndarray[float], list[float]]]): The target fractional coordinates to place the site closest to. Default is ``None``, in which case the site is placed closest to the centre of the supercell (i.e. [0.5, 0.5, 0.5]). """ if isinstance(symm_ops, Structure): sga = get_sga(symm_ops) symm_ops = sga.get_symmetry_operations() target_frac_coords = [0.5, 0.5, 0.5] if target_frac_coords is None else target_frac_coords # translate to put defect at closest possible site to target_frac_coords symm_op_pos_dict = {} for i, symm_op in enumerate(symm_ops): # should check if frac or cartesian is faster symm_opped_site = apply_symm_op_to_site(symm_op, site, fractional=True, rotate_lattice=False) symm_op_pos_dict[i] = symm_opped_site.to_unit_cell().frac_coords # get symm_op which puts defect closest to target_frac_coords: closest_site = min(symm_op_pos_dict.items(), key=lambda x: np.linalg.norm(x[1] - target_frac_coords)) return symm_ops[closest_site[0]] def _stencil_target_cell_from_big_cell( big_supercell: Structure, target_supercell: Structure, edge_tol: float = 1.0, bulk_min_bond_length: Optional[float] = None, orig_min_dist: Optional[float] = None, pbar: Optional[tqdm] = None, ) -> Structure: """ Given the input ``big_supercell`` and ``target_supercell`` (which should be fully encompassed by the former), stencil out the sites in ``big_supercell`` which correspond to the sites in ``target_supercell`` (i.e. are within the Cartesian bounds of ``target_supercell``). Note that this function assumes that the defect is roughly centred within ``big_supercell`` (i.e. near [0.5, 0.5, 0.5])! The midpoints of ``target_supercell`` and ``big_supercell`` are then aligned within this function, before stenciling. We need to ensure the appropriate number of sites (and their composition) are taken, and that the sites we choose are appropriate for the new supercell (i.e. that if we have e.g. an in-plane contraction, we don't take duplicate atoms that then correspond to tiny inter-atomic distances in the new supercell due to imperfect stenciling under PBC -- so we can't simply take the atoms that are closest to the defect). So, here we scan over possible choices of atoms to include, and take the combination which maximises the _minimum_ inter-atomic distance in the new supercell, when accounting for PBCs. Args: big_supercell (Structure): The supercell structure which fully encompasses ``target_supercell``, from which to stencil out the sites. target_supercell (Structure): The supercell structure giving the cell dimensions to stencil out from ``big_supercell``. edge_tol (float): A tolerance (in Angstrom) for site displacements at the edge of the stenciled supercell, when determining the best match of sites to stencil out in the new supercell (of ``target_supercell`` dimension). Default is 1 Angstrom, and then this is sequentially increased up to 4.5 Angstrom if the initial scan fails. bulk_min_bond_length (float): The minimum interatomic distance in the bulk supercell. Default is ``None``, in which case it is calculated from ``target_supercell``. orig_min_dist (float): The minimum interatomic distance in the original defect supercell. Default is ``None``, in which case it is calculated from ``big_supercell``. pbar (tqdm): ``tqdm`` progress bar object to update (for internal ``doped`` usage). Default is ``None``. Returns: Structure: The stenciled supercell structure. """ # first, translate sites to put the centre of big_supercell (which should have defect site at the # centre) to the centre of the target supercell (w/cart coords) target_supercell_midpoint_cart_coords = target_supercell.lattice.get_cartesian_coords([0.5, 0.5, 0.5]) translation_from_big_defect_middle_to_target_middle = ( target_supercell_midpoint_cart_coords - big_supercell.lattice.get_cartesian_coords([0.5, 0.5, 0.5]) ) big_supercell = translate_structure( big_supercell, translation_from_big_defect_middle_to_target_middle, frac_coords=False, to_unit_cell=False, ) if bulk_min_bond_length is None: bulk_min_bond_length = min_dist(target_supercell) if orig_min_dist is None: orig_min_dist = min_dist(big_supercell.copy().remove_species(["X"])) # get target composition accounting for defect presence in big supercell: num_sc = big_supercell.volume / target_supercell.volume # may be fractional big_supercell_comp_wout_X = big_supercell.copy().remove_species(["X"]).composition target_composition = big_supercell_comp_wout_X - ((num_sc - 1) * target_supercell.composition) target_composition = Composition({k: round(v) for k, v in target_composition.items()}) while edge_tol <= 4.5: # sequentially increase edge_tol by 0.5 Å, up to 4.5 Å, until match is found: try: ( def_new_supercell_sites, def_new_supercell_sites_to_check_in_target, candidate_sites_in_target, combo_composition, ) = _get_candidate_supercell_sites( big_supercell, target_supercell, target_composition, edge_tol ) num_sites_up_for_grabs = int(sum(combo_composition.values())) # number of sites to be placed candidate_sites_in_target = _remove_overlapping_sites( candidate_sites_in_target=candidate_sites_in_target, def_new_supercell_sites_to_check_in_target=def_new_supercell_sites_to_check_in_target, big_supercell_defect_coords=target_supercell_midpoint_cart_coords, bulk_min_bond_length=bulk_min_bond_length, pbar=pbar, ) if len(candidate_sites_in_target) < num_sites_up_for_grabs: raise RuntimeError( f"Too little candidate sites ({len(candidate_sites_in_target)}) to match target " f"composition ({num_sites_up_for_grabs} sites to be placed). Aborting." ) num_combos = math.comb(len(candidate_sites_in_target), num_sites_up_for_grabs) if num_combos > 1e10: raise RuntimeError( "Far too many possible site combinations to check, indicating a code failure. " "Aborting, please report this case to the developers!" ) if pbar is not None: pbar.set_description( f"Calculating best match (edge_tol = {edge_tol} Å, possible combos = {num_combos})" ) # 40% of pbar species_symbols = [site.specie.symbol for site in candidate_sites_in_target] min_interatomic_distances_tuple_combo_dict = {} idx_combos = list(combinations(range(len(candidate_sites_in_target)), num_sites_up_for_grabs)) if idx_combos and idx_combos != [()]: for idx_combo in idx_combos: if _Composition__eq__( _cached_Composition_init( Hashabledict(Counter([species_symbols[i] for i in idx_combo])) ), combo_composition, ): # could early break cases where the distances are too small? if a bottleneck, # currently not. And/or could loop over subsets of each possible combo first, # culling any which break this fake_candidate_struct_sites = def_new_supercell_sites_to_check_in_target + [ candidate_sites_in_target[i] for i in idx_combo ] frac_coords = [site.frac_coords for site in fake_candidate_struct_sites] distance_matrix = target_supercell.lattice.get_all_distances( frac_coords, frac_coords ) sorted_distances = np.sort(distance_matrix.flatten()) min_interatomic_distances_tuple_combo_dict[ tuple(sorted_distances[len(frac_coords) : len(frac_coords) + 10]) # take the 10 smallest distances, in case defect site is smallest distance ] = [candidate_sites_in_target[i] for i in idx_combo] # get the candidate structure with the largest minimum interatomic distance: min_interatomic_distances_tuple_combo_list = sorted( min_interatomic_distances_tuple_combo_dict.items(), key=lambda x: x[0], reverse=True, ) new_supercell_sites = def_new_supercell_sites + list( min_interatomic_distances_tuple_combo_list[0][1] ) else: new_supercell_sites = def_new_supercell_sites new_supercell = Structure( lattice=target_supercell.lattice, species=[site.specie for site in new_supercell_sites], coords=[site.coords for site in new_supercell_sites], coords_are_cartesian=True, to_unit_cell=True, ) new_supercell_w_defect_comp = new_supercell.copy() new_supercell_w_defect_comp.remove_species("X") # raise RuntimeError and dynamically increase edge_tol if resulting min_dist too small: _check_min_dist(new_supercell_w_defect_comp, orig_min_dist, warning=False) try: if target_composition != target_supercell.composition: # defect, not bulk defect_type, comp_diff = get_defect_type_and_composition_diff( target_supercell, new_supercell_w_defect_comp ) break # match found! except ValueError as e: raise RuntimeError("Incorrect defect cell obtained. Aborting.") from e except RuntimeError as e: edge_tol += 0.5 if edge_tol > 4.5: raise e if pbar is not None: pbar.n = 20 # decrease pbar progress back to 20% pbar.refresh() pbar.set_description(f"Trying edge_tol = {edge_tol} Å") continue return new_supercell def _convert_defect_neighbours_to_X( defect_supercell: Structure, defect_position: np.ndarray[float], coords_are_cartesian: bool = False, ws_radius_fraction: float = 0.5, ) -> Structure: """ Convert all neighbouring sites of a defect site in a supercell, within half the Wigner-Seitz radius, to have their species as "X" (storing their original species in the site property dict as "orig_species"). Intended to then be used to make structure-matching far more efficient, by ignoring the highly-perturbed defect neighbourhood (which requires larger ``stol`` values which grealy slow down structure-matching). Args: defect_supercell (Structure): The defect supercell to edit. defect_position (np.ndarray[float]): The coordinates of the defect site, either fractional or cartesian depending on ``coords_are_cartesian``. coords_are_cartesian (bool): Whether the defect position is in cartesian coordinates. Default is ``False`` (fractional coordinates). ws_radius_fraction (float): The fraction of the Wigner-Seitz radius to use as the cut-off distance for neighbouring sites to convert to species "X". Default is 0.5 (50%). Returns: Structure: The supercell structure with the defect site and all neighbouring sites converted to species X. """ converted_defect_supercell = defect_supercell.copy() ws_radius = get_wigner_seitz_radius(defect_supercell) defect_frac_coords = ( defect_position if not coords_are_cartesian else defect_supercell.lattice.get_fractional_coords(defect_position) ) site_indices_to_convert = np.where( # vectorised for fast computation defect_supercell.lattice.get_all_distances( defect_supercell.frac_coords, defect_frac_coords ).ravel() < np.max((ws_radius * ws_radius_fraction, 1)) )[0] for i, orig_site in enumerate(defect_supercell): if i in site_indices_to_convert: converted_defect_supercell.replace(i, "X") # we set properties for all sites, because ``Structure.from_sites()`` in ``pymatgen``'s # ``StructureMatcher`` adds properties to all sites, which can then mess with site comparisons... converted_defect_supercell[i].properties["orig_species"] = orig_site.specie.symbol return converted_defect_supercell def _convert_X_back_to_orig_species(converted_defect_supercell: Structure) -> Structure: """ Convert all sites in a supercell with species "X" and "orig_species" in the site property dict back to their original species. Mainly intended just for internal ``doped`` usage, to convert back sites which had been converted to "X" for efficient structure-matching (see ``_convert_defect_neighbours_to_X``). Args: converted_defect_supercell (Structure): The defect supercell to convert back. Returns: Structure: The supercell structure with all sites with species "X" and "orig_species" in the site property dict converted back to their original species. """ defect_supercell = converted_defect_supercell.copy() for i, site in enumerate(defect_supercell): if site.specie.symbol == "X" and site.properties.get("orig_species"): defect_supercell.replace(i, site.properties.pop("orig_species", site.specie.symbol)) return defect_supercell def _check_min_dist( structure: Structure, orig_min_dist: float = 5.0, warning: bool = True, ignored_species: Optional[list[str]] = None, ) -> None: """ Helper function to check if the minimum interatomic distance in the provided ``structure`` are reasonable. Args: structure (Structure): The structure to check. orig_min_dist (float): The minimum interatomic distance in the original structure. If the minimum interatomic distance in the new structure is smaller than this, and smaller than a reasonable minimum distance (0.65 Å if H in structure, else 1.0 Å), a warning or error is raised. Default is 5.0 Å. warning (bool): Whether to raise a warning or an error if the minimum interatomic distance is too small. Default is ``True`` (warning). ignored_species (list[str]): A list of species symbols to ignore when calculating the minimum interatomic distance. Default is ``None``. """ H_in_struct = any(site for site in structure.sites if site.specie.symbol == "H") reasonable_min_dist = 0.65 if H_in_struct else 1.0 struct_min_dist = min_dist(structure, ignored_species) if struct_min_dist < min(orig_min_dist, reasonable_min_dist): message = ( f"Generated structure has a minimum interatomic distance of {struct_min_dist:.2f} Å, smaller " f"than the original defect supercell ({orig_min_dist:.2f} Å), which may be unreasonable. " f"Please check if this minimum distance and structure make sense, and if not please report " f"this issue to the developers!" ) if warning: warnings.warn(message) else: raise RuntimeError(message) def _get_candidate_supercell_sites( big_supercell: Structure, target_supercell: Structure, target_composition: Composition, edge_tol: float = 1.0, ) -> tuple[list[PeriodicSite], list[PeriodicSite], list[PeriodicSite], Composition]: """ Get all atoms in ``big_supercell`` which are within the Cartesian bounds of the ``target_supercell`` supercell. We need to ensure the same number of sites, and that the sites we choose are appropriate for the new supercell (i.e. that if we have e.g. an in-plane contraction, we don't take duplicate atoms that then correspond to tiny inter-atomic distances in the new supercell due to imperfect stenciling under PBC -- so we can't just take the atoms that are closest to the defect). So, this function determines the possible sites to include in the new supercell, the sites which are definitely in the target supercell, and the sites which are near the bordering regions of the target supercell and so may or may not be included (i.e. ``candidate_sites_in_target``), using the provided ``edge_tol``. Note that this function assumes that the defect (or any significant atomic displacements) is (are) roughly centred within ``big_supercell`` (i.e. near [0.5, 0.5, 0.5])! Args: big_supercell (Structure): The super-supercell with a single defect supercell and rest of the sites populated by the bulk supercell. target_supercell (Structure): The supercell structure to re-generate the relaxed defect structure in. target_composition (Composition): The composition of the target supercell. edge_tol (float): A tolerance (in Angstrom) for site displacements at the edge of the ``target_supercell`` supercell, when determining the best match of sites to stencil out in the new supercell. Default is 1 Angstrom. Returns: tuple[list[PeriodicSite], list[PeriodicSite], list[PeriodicSite], Composition, int]: - ``def_new_supercell_sites``: List of sites in the super-supercell which are within the bounds of the target supercell (minus ``edge_tol``) - ``def_new_supercell_sites_to_check_in_target``: List of sites in the super-supercell which are near the bordering regions of the target supercell (within ``edge_tol*2`` of the target cell edge). - ``candidate_sites_in_target``: List of candidate sites in the target supercell to check if overlapping with each other or ``def_new_supercell_sites_to_check_in_target``. - ``combo_composition``: The composition we need to add to the target supercell. """ # Note: This could possibly be made faster by reducing `edge_tol` when the `orig_supercell` is # fully encompassed by `target_supercell` (meaning there should be no defect-induced # displacements at the stenciled cell edges here), by getting the minimum encompassing cube # length (sphere diameter), and seeing if this is smaller than the largest sphere (diameter) # which can be inscribed in the target supercell # either way, this portion of the workflow is not a bottleneck (rather `get_orientation..` is) possible_new_supercell_sites = [ site for site in big_supercell.sites if is_within_frac_bounds(target_supercell.lattice, site.coords, tol=edge_tol) ] # tolerance of 2 Angstrom displacement at cell edges def_new_supercell_sites = [ site for site in possible_new_supercell_sites if is_within_frac_bounds(target_supercell.lattice, site.coords, tol=-edge_tol) ] # at least 2 Angstrom inside cell edges def_new_supercell_sites_to_check_in_target = [ PeriodicSite(site.specie, site.coords, lattice=target_supercell.lattice, coords_are_cartesian=True) for site in def_new_supercell_sites if not is_within_frac_bounds(target_supercell.lattice, site.coords, tol=-edge_tol * 2) ] candidate_sites_in_target = [ PeriodicSite(site.specie, site.coords, lattice=target_supercell.lattice, coords_are_cartesian=True) for site in possible_new_supercell_sites if site not in def_new_supercell_sites ] # target additional composition: def_new_supercell_sites_struct = Structure.from_sites(def_new_supercell_sites) def_new_supercell_sites_struct.remove_species("X") def_new_supercell_sites_comp = def_new_supercell_sites_struct.composition combo_composition = target_composition - def_new_supercell_sites_comp # composition we need to add # def_new_supercell_sites also has X in it return ( def_new_supercell_sites, def_new_supercell_sites_to_check_in_target, candidate_sites_in_target, combo_composition, ) def _remove_overlapping_sites( candidate_sites_in_target: list[PeriodicSite], def_new_supercell_sites_to_check_in_target: list[PeriodicSite], big_supercell_defect_coords: np.ndarray[float], bulk_min_bond_length: Optional[float] = None, pbar: tqdm = None, ) -> list[PeriodicSite]: """ Remove sites in ``candidate_sites_in_target`` which overlap either with each other (within 50% of the bulk bond length; in which case the site closest to the defect coords is removed) or with sites in ``def_new_supercell_sites_to_check_in_target`` (within 50% of the bulk bond length). Args: candidate_sites_in_target (list[PeriodicSite]): List of candidate sites in the target supercell to check if overlapping with each other or ``def_new_supercell_sites_to_check_in_target``. def_new_supercell_sites_to_check_in_target (list[PeriodicSite]): List of sites that are in the target supercell but are near the bordering regions, so are used to check for overlapping. big_supercell_defect_coords (np.ndarray[float]): *Cartesian* coordinates of the defect site in the big supercell. Used to choose between overlapping sites (favouring those which have a larger distance from the defect site). bulk_min_bond_length (float): The minimum bond length in the bulk supercell, used to check if inter-site distances are reasonable. If ``None`` (default), determined automatically from ``candidate_sites_in_target``. pbar (tqdm): ``tqdm`` progress bar object to update (for internal ``doped`` usage). Default is ``None``. Returns: list[PeriodicSite]: The list of candidate sites in the target supercell which do not overlap with each other or with ``def_new_supercell_sites_to_check_in_target``. """ if not candidate_sites_in_target: if pbar is not None: pbar.update(20) return candidate_sites_in_target if bulk_min_bond_length is None: bulk_min_bond_length = min_dist(Structure.from_sites(candidate_sites_in_target)) # scan over all possible combinations of num_sites_up_for_grabs sites in candidate_sites_in_target: check_other_candidate_sites_first = len(candidate_sites_in_target) < len( def_new_supercell_sites_to_check_in_target ) # check smaller list first for efficiency overlapping_site_indices: list[int] = [] # using indices as faster for comparing than actual sites _pbar_increment_per_iter = max( 0, 20 / len(candidate_sites_in_target) - 0.0001 ) # up to 20% of progress bar def _check_other_sites( idx, candidate_sites_in_target, overlapping_site_indices, big_supercell_defect_coords, bulk_min_bond_length, ): for other_idx, other_site in enumerate(candidate_sites_in_target): if ( idx == other_idx or other_idx in overlapping_site_indices or candidate_site.specie.symbol != other_site.specie.symbol ): continue if candidate_site.distance(other_site) < bulk_min_bond_length * 0.5: # if distance is less than 50% of bulk bond length, add the site with smaller # distance from defect to overlapping_sites (i.e. taking the site with larger # distance to defect as a remaining candidate site) overlapping_site_indices.append( min( [(idx, candidate_site), (other_idx, other_site)], key=lambda x: x[1].distance_from_point(big_supercell_defect_coords), )[0] ) return overlapping_site_indices for idx, candidate_site in list(enumerate(candidate_sites_in_target)): if pbar is not None: pbar.update(_pbar_increment_per_iter) if idx in overlapping_site_indices: continue if check_other_candidate_sites_first: overlapping_site_indices = _check_other_sites( idx, candidate_sites_in_target, overlapping_site_indices, big_supercell_defect_coords, bulk_min_bond_length, ) for site in def_new_supercell_sites_to_check_in_target: if candidate_site.distance(site) < bulk_min_bond_length * 0.5: overlapping_site_indices.append(idx) break if idx in overlapping_site_indices: continue if not check_other_candidate_sites_first: overlapping_site_indices = _check_other_sites( idx, candidate_sites_in_target, overlapping_site_indices, big_supercell_defect_coords, bulk_min_bond_length, ) return [site for i, site in enumerate(candidate_sites_in_target) if i not in overlapping_site_indices] def _get_matching_sites_from_s1_then_s2( template_struct: Structure, struct1_pool: Structure, struct2_pool: Structure, orig_min_dist: float = 5.0, ) -> Structure: """ Generate a stenciled structure from a template sub-set structure and two pools of sites/structures. Given two pools of sites/structures, returns a new structure which has (1) the closest site in ``struct1_pool`` to each site in ``template_struct``, and (2) ``struct2_pool.num_sites - struct2.num_sites`` sites from ``struct2_pool``, chosen as those with the largest minimum distances from sites in the first set of sites (i.e. the single defect subcell matching ``template_struct``). The targeted use case is transforming a large periodically-repeated super-supercell of a defect supercell (``struct1_pool``) into the same super-supercell but with only one copy of the original defect supercell (``template_struct``), and the rest of the sites populated by the bulk super-supercell (``struct2_pool``; with ``struct2`` being the original bulk supercell). Args: template_struct (Structure): The template structure to match. struct1_pool (Structure): The first pool of sites to match to the template structure. struct2_pool (Structure): The second pool of sites to match to the template structure. orig_min_dist (float): The minimum interatomic distance in the original (bulk) structure. Used to sanity check the output; if the minimum interatomic distance in the templated structure is smaller than this, and smaller than a reasonable minimum distance (0.65 Å if H in structure, else 1.0 Å), an error is raised. Default is 5.0 Å. Returns: Structure: The stenciled structure. """ num_super_supercells = len(struct1_pool) // len(template_struct) # both also have X single_defect_subcell_sites = [] species_coord_dict = {} # avoid recomputing coords for each site species_idx_dict = {} for element in _fast_get_composition_from_sites(struct1_pool).elements: species_coord_dict[element.symbol], species_idx_dict[element.symbol] = ( get_coords_and_idx_of_species(struct1_pool, element.symbol, frac_coords=False) ) for ( sub_site ) in template_struct.sites: # get closest site in big supercell to site, using cartesian coords: closest_site_dict_idx = np.argmin( np.linalg.norm(species_coord_dict[sub_site.specie.symbol] - sub_site.coords, axis=1), ) single_defect_subcell_sites.append( struct1_pool[species_idx_dict[sub_site.specie.symbol][closest_site_dict_idx]] ) assert len(set(single_defect_subcell_sites)) == len(single_defect_subcell_sites) # no repeats species_coord_dict = {} # avoid recomputing coords for each site for element in _fast_get_composition_from_sites(struct2_pool).elements: species_coord_dict[element.symbol] = get_coords_and_idx_of_species( single_defect_subcell_sites, element.symbol, frac_coords=True # frac_coords )[0] struct2_pool_dists_to_template_centre = struct2_pool.lattice.get_all_distances( struct2_pool.frac_coords, struct2_pool.lattice.get_fractional_coords( template_struct.lattice.get_cartesian_coords([0.5, 0.5, 0.5]) ), ).ravel() # template centre is defect site in stenciling workflow largest_encompassed_cube_length = _largest_cube_length_from_matrix(template_struct.lattice.matrix) candidate_struct2_pool_species_sites: dict[str, list[PeriodicSite]] = { super_site.specie.symbol: [] for super_site in struct2_pool } for dist_to_template_centre, super_site in zip(struct2_pool_dists_to_template_centre, struct2_pool): # screen to sites outside defect WS radius, for efficiency: if dist_to_template_centre > largest_encompassed_cube_length * 0.49: # 2% buffer (cube length / 2) candidate_struct2_pool_species_sites[super_site.specie.symbol].append(super_site) struct2_pool_site_min_dist_dict = {} for species_symbol, species_sites in candidate_struct2_pool_species_sites.items(): dist_matrix = struct2_pool.lattice.get_all_distances( # vectorised for fast computation species_coord_dict[species_symbol], [site.frac_coords for site in species_sites] ) # M x N min_dists = np.min(dist_matrix, axis=0) # down columns struct2_pool_site_min_dist_dict.update(dict(zip(species_sites, min_dists))) # sort possible_bulk_outer_cell_sites by (largest) min dist to single_defect_subcell_sites: possible_bulk_outer_cell_sites = sorted( [site for sites in candidate_struct2_pool_species_sites.values() for site in sites], key=lambda x: struct2_pool_site_min_dist_dict[x], reverse=True, ) bulk_outer_cell_sites = possible_bulk_outer_cell_sites[ : int(len(struct2_pool) * (1 - 1 / num_super_supercells)) ] return Structure.from_sites(single_defect_subcell_sites + bulk_outer_cell_sites) def _get_superset_matrix_and_supercells( structure: Structure, target_supercell: Structure, defect_frac_coords: Optional[Union[np.ndarray[float], list[float]]] = None, ) -> tuple: # tuple[np.ndarray[int], Structure, Structure, Structure] or tuple[np.ndarray[int], Structure] """ Given a structure and a target supercell, return the transformation ('superset') matrix which makes all lattice vectors for the structure larger than or equal to the largest lattice vector in ``target_supercell``. Args: structure (Structure): The original structure for which to get the superset matrix that fully encompasses the target supercell. target_supercell (Structure): The target supercell. defect_frac_coords (Optional: Union[np.ndarray[float], list[float]]): The fractional coordinates of a defect site in the structure. If provided, will add an "X" marker (fake species) to this site in a copy of ``structure``, and additionally return the big supercell with copies of "X", and the original structure with "X". Returns: Union[tuple[np.ndarray[int], Structure, Structure, Structure], tuple[np.ndarray[int]]: - If ``defect_frac_coords`` is not provided, returns a tuple containing the superset matrix and the big supercell. - If ``defect_frac_coords`` is provided, returns a tuple containing the superset matrix, the big supercell, the big supercell with _repeated_ defect sites marked by "X", and a copy of the original structure with the defect site as "X". """ min_cell_length = _get_all_encompassing_cube_length(target_supercell.lattice) # get supercell matrix which makes all lattice vectors for orig_supercell larger than min_cell_length: superset_matrix = np.ceil(min_cell_length / structure.lattice.abc) # could possibly use non-diagonal supercells to make this more efficient in some cases, but a lot of # work, shouldn't really contribute much to slowdowns, and only relevant in some rare cases (?) big_supercell = structure * superset_matrix if defect_frac_coords is None: return superset_matrix, big_supercell structure_with_X = structure.copy() # get defect coords in big supercell: structure_with_X.append("X", defect_frac_coords, coords_are_cartesian=False) big_supercell_with_X = structure_with_X * superset_matrix return superset_matrix, big_supercell, big_supercell_with_X, structure_with_X def _get_all_encompassing_cube_length(lattice: Lattice) -> float: """ Get the smallest possible cube that fully encompasses the cell, _regardless of orientation_. This is determined by getting the 8 vertices of the cell and computing the max distance between any two vertices, giving the side length of an all- encompassing cube. Equivalent to getting the diameter of an encompassing sphere. Args: lattice (Lattice): The lattice to get the all-encompassing cube length for. Returns: float: The side length of the all-encompassing cube. """ # get all vertices of the cell: cart_vertices = np.array([lattice.get_cartesian_coords(perm) for perm in product([0, 1], repeat=3)]) # get 2D matrix of all distances between vertices: distances = np.linalg.norm(cart_vertices[:, None] - cart_vertices, axis=-1) return max(distances.flatten())
[docs] def is_within_frac_bounds( lattice: Lattice, cart_coords: Union[np.ndarray[float], list[float]], tol: float = 1e-5 ) -> bool: """ Check if a given Cartesian coordinate is inside the unit cell defined by the lattice object. Args: lattice (Lattice): ``Lattice`` object defining the unit cell. cart_coords (Union[np.ndarray[float], list[float]]): The Cartesian coordinates to check. tol (float): A tolerance (in Angstrom / cartesian units) for coordinates to be considered within the unit cell. If positive, expands the bounds of the unit cell by this amount, if negative, shrinks the bounds. Returns: bool: Whether the Cartesian coordinates are within the fractional bounds of the unit cell, accounting for ``tol``. """ frac_coords = lattice.get_fractional_coords(cart_coords) frac_tols = np.array([tol, tol, tol]) / lattice.abc # Check if fractional coordinates are in the range [0, 1) return np.all((frac_coords + frac_tols >= 0) & (frac_coords - frac_tols < 1))