Skip to content

Core module

Provides the Embedding class.

This module enables the user to load in elemental representation data and analyse it using statistical functions.

Typical usage example

megnet16 = Embedding.load_data('megnet16')

Embedding

Represent an elemental representation.

To load an embedding distributed from the package use the load_data() method.

Works like a standard python dictionary. The keys are {element: vector} pairs.

Adds a few convenience methods related to elemental representations.

Source code in src/elementembeddings/core.py
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
class Embedding:
    """
    Represent an elemental representation.

    To load an embedding distributed from the package use the load_data() method.

    Works like a standard python dictionary. The keys are {element: vector} pairs.

    Adds a few convenience methods related to elemental representations.

    Attributes:
        embeddings (dict): Mapping of element symbol to embedding vector
            (numpy array).
        embedding_name (str | None): The name of the elemental representation.
        feature_labels (list): Labels for each component of the vectors.
        is_standardised (bool): Whether the pooled data has zero mean and
            unit standard deviation.
    """

    def __init__(
        self,
        embeddings: dict,
        embedding_name: Optional[str] = None,
        feature_labels: Optional[List[str]] = None,
    ):
        """Initialise the Embedding class.

        Vectors supplied as plain lists are converted to numpy arrays.
        Scalar-valued ("linear") representations are converted to one-hot
        vectors ordered by the scalar value.

        Args:
            embeddings (dict): A {element_symbol: vector} dictionary
            embedding_name (str): The name of the elemental representation
            feature_labels (list(str)): A list of feature labels
        """
        self.embeddings = embeddings
        self.embedding_name = embedding_name
        self.feature_labels = feature_labels
        # Record whether the data already has zero mean and unit std.
        if not self._is_standardised():
            self.is_standardised = False
        else:
            self.is_standardised = True

        # Grab a random value from the embedding vector
        _rand_embed = random.choice(list(self.embeddings.values()))
        # Convert embeddings to numpy array if not already a numpy array
        if not isinstance(_rand_embed, np.ndarray):
            self.embeddings = {
                ele: np.array(self.embeddings[ele]) for ele in self.embeddings
            }

        # Determines if the embedding vector has a length attribute
        # (i.e. is not a scalar int or float)
        # If the 'vector' is a scalar/float, the representation is linear
        # A linear representation gets converted to a one-hot vector
        if hasattr(_rand_embed, "__len__") and (not isinstance(_rand_embed, str)):
            self.embedding_type: str = "vector"
            self.dim: int = len(random.choice(list(self.embeddings.values())))
        else:
            self.embedding_type: str = "linear"

        # Create one-hot vectors for a scalar representation
        if self.embedding_type == "linear":
            # Order elements by their scalar value before one-hot encoding.
            sorted_embedding = sorted(self.embeddings.items(), key=lambda x: x[1])
            elements = np.loadtxt(
                f"{data_directory}/element_data/ordered_periodic.txt", dtype=str
            )
            # mod_petti covers only the first 103 elements; other scales 118.
            if self.embedding_name == "mod_petti":
                sorted_embedding = {
                    el: num for el, num in sorted_embedding if el in elements[:103]
                }
            else:
                sorted_embedding = {
                    el: num for el, num in sorted_embedding if el in elements[:118]
                }
            self.feature_labels = list(sorted_embedding.keys())
            self.embeddings = {}

            # Each element's one-hot vector has a 1 at its scalar index.
            # NOTE(review): assumes the scalar values are valid indices
            # 0..len-1 after filtering — confirm against the data files.
            for el, num in sorted_embedding.items():
                self.embeddings[el] = np.zeros(len(sorted_embedding))
                self.embeddings[el][num] = 1
            self.dim = len(random.choice(list(self.embeddings.values())))

        # Fall back to integer component labels when none were supplied.
        if not self.feature_labels:
            self.feature_labels = list(range(self.dim))
        else:
            # NOTE(review): self-assignment is redundant; kept for parity.
            self.feature_labels = self.feature_labels

        # Dummy initialisation for results
        self._data = []
        self._pca_data = None  # type: Optional[np.ndarray]
        self._tsne_data = None  # type: Optional[np.ndarray]
        self._umap_data = None  # type: Optional[np.ndarray]

    @staticmethod
    def load_data(embedding_name: Optional[str] = None):
        """
        Create an instance of the `Embedding` class from a default embedding file.

        The default embeddings are in the table below:

        | **Name**                | **str_name** |
        |-------------------------|--------------|
        | Magpie                  | magpie       |
        | Magpie (scaled)         | magpie_sc    |
        | Mat2Vec                 | mat2vec      |
        | Matscholar              | matscholar   |
        | Megnet (16 dimensions)  | megnet16     |
        | Modified pettifor scale | mod_petti    |
        | Oliynyk                 | oliynyk      |
        | Oliynyk (scaled)        | oliynyk_sc   |
        | Random (200 dimensions) | random_200   |
        | SkipAtom                | skipatom     |
        | Atomic Number           | atomic       |


        Args:
            embedding_name (str): The str_name of an embedding file.

        Returns:
            Embedding :class:`Embedding` instance.
        """
        _cbfv_files = {
            "magpie": "magpie.csv",
            "magpie_sc": "magpie_sc.json",
            "mat2vec": "mat2vec.csv",
            "matscholar": "matscholar-embedding.json",
            "megnet16": "megnet16.json",
            "mod_petti": "mod_petti.json",
            "oliynyk": "oliynyk_preprocessed.csv",
            "oliynyk_sc": "oliynyk_sc.json",
            "random_200": "random_200_new.csv",
            "skipatom": "skipatom_20201009_induced.csv",
            "atomic": "atomic.json",
        }

        if _cbfv_files[embedding_name].endswith(".csv"):
            return Embedding.from_csv(
                path.join(
                    data_directory,
                    "element_representations",
                    _cbfv_files[embedding_name],
                ),
                embedding_name,
            )
        elif "megnet" in _cbfv_files[embedding_name]:
            return Embedding.from_json(
                path.join(
                    data_directory,
                    "element_representations",
                    _cbfv_files[embedding_name],
                ),
                embedding_name,
            ).remove_elements(["Null"])
        elif _cbfv_files[embedding_name].endswith(".json"):
            return Embedding.from_json(
                path.join(
                    data_directory,
                    "element_representations",
                    _cbfv_files[embedding_name],
                ),
                embedding_name,
            )

    @staticmethod
    def from_json(embedding_json, embedding_name: Optional[str] = None):
        """
        Create an instance of the Embedding class from a json file.

        Args:
            embedding_json (str): Filepath of the json file
            embedding_name (str): The name of the elemental representation
        """
        # Need to add validation handling for JSONs in different formats
        with open(embedding_json) as f:
            embedding_data = json.load(f)
        return Embedding(embedding_data, embedding_name)

    @staticmethod
    def from_csv(embedding_csv, embedding_name: Optional[str] = None):
        """
        Create an instance of the Embedding class from a csv file.

        The first column of the csv file must contain the elements and be named element.

        Args:
            embedding_csv (str): Filepath of the csv file
            embedding_name (str): The name of the elemental representation

        """
        # Need to add validation handling for csv files
        df = pd.read_csv(embedding_csv)
        elements = list(df["element"])
        df.drop(["element"], axis=1, inplace=True)
        feature_labels = list(df.columns)
        embeds_array = df.to_numpy()
        embedding_data = {
            elements[i]: embeds_array[i] for i in range(len(embeds_array))
        }
        return Embedding(embedding_data, embedding_name, feature_labels)

    def as_dataframe(self, columns: str = "components") -> pd.DataFrame:
        """
        Return the embedding as a pandas Dataframe.

        The first column is the elements and each other
        column represents a component of the embedding.

        Args:
            columns (str): A string to specify if the columns are the vector components
            and the index is the elements (`columns='components'`)
            or the columns are the elements (`columns='elements'`).

        Returns:
            df (pandas.DataFrame): A pandas dataframe object


        """
        embedding = self.embeddings
        df = pd.DataFrame(embedding, index=self.feature_labels)
        if columns == "components":
            return df.T
        elif columns == "elements":
            return df
        else:
            raise (
                ValueError(
                    f"{columns} is not a valid keyword argument. "
                    "Choose either 'components' or 'elements"
                )
            )

    def to(self, fmt: str = "", filename: Optional[str] = ""):
        """
        Output the embedding to a file.

        Args:
            fmt (str): The file format to output the embedding to.
            Options include "json" and "csv".
            filename (str): The name of the file to be outputted
        Returns:
            (str) if filename not specified, otherwise None.
        """
        fmt = fmt.lower()

        if fmt == "json" or fnmatch.fnmatch(filename, "*.json"):
            j = json.dumps(self.embeddings, cls=NumpyEncoder)
            if filename:
                if not filename.endswith(".json"):
                    filename = filename + ".json"
                with open(filename, "w") as file:
                    file.write(j)
            else:
                return j
        elif fmt == "csv" or fnmatch.fnmatch(filename, "*.csv"):
            if filename:
                if not filename.endswith(".csv"):
                    filename = filename + ".csv"
                self.as_dataframe().to_csv(filename, index_label="element")
            else:
                return self.as_dataframe().to_csv(index_label="element")

        else:
            raise ValueError(f"{str(fmt)} is an invalid file format")

    @property
    def element_list(self) -> list:
        """Return the elements of the embedding."""
        return list(self.embeddings.keys())

    def remove_elements(self, elements: Union[str, List[str]], inplace: bool = False):
        # TO-DO allow removal by atomic numbers
        """
        Remove elements from the Embedding instance.

        Args:
            elements (str,list(str)): An element symbol or a list of element symbols
            inplace (bool): If True, elements are removed from the Embedding instance.
            If false, the original embedding instance is unchanged
            and a new embedding instance with the elements removed is created.

        """
        if inplace:
            if isinstance(elements, str):
                del self.embeddings[elements]
            elif isinstance(elements, list):
                for el in elements:
                    del self.embeddings[el]
            return None
        else:
            embeddings_copy = self.embeddings.copy()
            if isinstance(elements, str):
                del embeddings_copy[elements]
            elif isinstance(elements, list):
                for el in elements:
                    del embeddings_copy[el]
            return Embedding(embeddings_copy, self.embedding_name)

    def _is_standardised(self):
        """Check if the embeddings are standardised.

        Mean must be 0 and standard deviation must be 1.
        """
        return np.isclose(
            np.mean(np.array(list(self.embeddings.values()))), 0
        ) and np.isclose(np.std(np.array(list(self.embeddings.values()))), 1)

    def standardise(self, inplace: bool = False):
        """Standardise the embeddings.

        Mean is 0 and standard deviation is 1.

        Args:
            inplace (bool): If True, this instance is standardised in place
            and None is returned. If False, a new standardised Embedding is
            returned and this instance is unchanged.

        Returns:
            Embedding | None: A new Embedding when `inplace=False`, else None.
        """
        if self._is_standardised():
            warnings.warn(
                "Embedding is already standardised. "
                "Returning None and not changing the embedding."
            )
            return None
        embeddings_copy = self.embeddings.copy()
        # Standardise column-wise (per feature) across all elements.
        embeddings_array = StandardScaler().fit_transform(
            np.array(list(embeddings_copy.values()))
        )
        for el, emb in zip(embeddings_copy.keys(), embeddings_array):
            embeddings_copy[el] = emb

        if inplace:
            self.embeddings = embeddings_copy
            self.is_standardised = True
            return None
        # Pass feature_labels through — previously they were dropped, which
        # reset the returned Embedding's labels to integer indices.
        return Embedding(embeddings_copy, self.embedding_name, self.feature_labels)

    def citation(self) -> List[str]:
        """Return a citation for the embedding."""
        if self.embedding_name in ["magpie", "magpie_sc"]:
            citation = [
                "@article{ward2016general,"
                "title={A general-purpose machine learning framework for "
                "predicting properties of inorganic materials},"
                "author={Ward, Logan and Agrawal, Ankit and Choudhary, Alok "
                "and Wolverton, Christopher},"
                "journal={npj Computational Materials},"
                "volume={2},"
                "number={1},"
                "pages={1--7},"
                "year={2016},"
                "publisher={Nature Publishing Group}}"
            ]
        elif self.embedding_name == "mat2vec":
            citation = [
                "@article{tshitoyan2019unsupervised,"
                "title={Unsupervised word embeddings capture latent knowledge "
                "from materials science literature},"
                "author={Tshitoyan, Vahe and Dagdelen, John and Weston, Leigh "
                "and Dunn, Alexander and Rong, Ziqin and Kononova, Olga "
                "and Persson, Kristin A and Ceder, Gerbrand and Jain, Anubhav},"
                "journal={Nature},"
                "volume={571},"
                "number={7763},"
                "pages={95--98},"
                "year={2019},"
                "publisher={Nature Publishing Group} }"
            ]
        elif self.embedding_name == "matscholar":
            citation = [
                "@article{weston2019named,"
                "title={Named entity recognition and normalization applied to "
                "large-scale information extraction from the materials "
                "science literature},"
                "author={Weston, Leigh and Tshitoyan, Vahe and Dagdelen, John and "
                "Kononova, Olga and Trewartha, Amalie and Persson, Kristin A and "
                "Ceder, Gerbrand and Jain, Anubhav},"
                "journal={Journal of chemical information and modeling},"
                "volume={59},"
                "number={9},"
                "pages={3692--3702},"
                "year={2019},"
                "publisher={ACS Publications} }"
            ]

        elif self.embedding_name == "megnet16":
            citation = [
                "@article{chen2019graph,"
                "title={Graph networks as a universal machine learning framework "
                "for molecules and crystals},"
                "author={Chen, Chi and Ye, Weike and Zuo, Yunxing and "
                "Zheng, Chen and Ong, Shyue Ping},"
                "journal={Chemistry of Materials},"
                "volume={31},"
                "number={9},"
                "pages={3564--3572},"
                "year={2019},"
                "publisher={ACS Publications} }"
            ]

        elif self.embedding_name in ["oliynyk", "oliynyk_sc"]:
            citation = [
                "              @article{oliynyk2016high,"
                "title={High-throughput machine-learning-driven synthesis "
                "of full-Heusler compounds},"
                "author={Oliynyk, Anton O and Antono, Erin and Sparks, Taylor D and "
                "Ghadbeigi, Leila and Gaultois, Michael W and "
                "Meredig, Bryce and Mar, Arthur},"
                "journal={Chemistry of Materials},"
                "volume={28},"
                "number={20},"
                "pages={7324--7331},"
                "year={2016},"
                "publisher={ACS Publications} }"
            ]

        elif self.embedding_name == "skipatom":
            citation = [
                "@article{antunes2022distributed,"
                "title={Distributed representations of atoms and materials "
                "for machine learning},"
                "author={Antunes, Luis M and Grau-Crespo, Ricardo and Butler, Keith T},"
                "journal={npj Computational Materials},"
                "volume={8},"
                "number={1},"
                "pages={1--9},"
                "year={2022},"
                "publisher={Nature Publishing Group} }"
            ]
        elif self.embedding_name == "mod_petti":
            citation = [
                "@article{glawe2016optimal,"
                "title={The optimal one dimensional periodic table: "
                "a modified Pettifor chemical scale from data mining},"
                "author={Glawe, Henning and Sanna, Antonio and Gross, "
                "EKU and Marques, Miguel AL},"
                "journal={New Journal of Physics},"
                "volume={18},"
                "number={9},"
                "pages={093011},"
                "year={2016},"
                "publisher={IOP Publishing} }"
            ]

        else:
            citation = []

        return citation

    def _is_el_in_embedding(self, el: str) -> bool:
        """
        Check if an element is in the `Embedding` object.

        Args:
            el (str): An element symbol
        Returns:
            bool: True if el is in the Embedding, else False
        """
        if el in self.element_list:
            return True
        else:
            return False

    @property
    def element_groups_dict(self) -> Dict[str, str]:
        """
        Return a dictionary of {element: element type} pairs.

        e.g. {'He':'Noble gas'}

        """
        # Load the packaged element_group.json lookup and filter it down to
        # the elements present in this embedding.
        with open(path.join(data_directory, "element_data/element_group.json")) as f:
            _dict = json.load(f)
        return {i: _dict[i] for i in self.element_list}

    def create_pairs(self):
        """Create all possible pairs of elements."""
        ele_list = self.element_list
        ele_pairs = combinations_with_replacement(ele_list, 2)
        return ele_pairs

    def compute_correlation_metric(
        self, ele1: str, ele2: str, metric: str = "pearson"
    ) -> float:
        """
        Compute the correlation/similarity metric between two vectors.

        Allowed metrics:
        * Pearson
        * Spearman
        * Cosine similarity

        Args:
            ele1 (str): element symbol
            ele2 (str): element symbol
            metric (str): name of a correlation metric.
            Options are "spearman", "pearson" and "cosine_similarity".

        Returns:
            float: correlation/similarity metric
        """
        # Define the allowable metrics
        scipy_corrs = {"pearson": pearsonr, "spearman": spearmanr}

        if metric == "pearson":
            return scipy_corrs[metric](
                self.embeddings[ele1], self.embeddings[ele2]
            ).statistic
        elif metric == "spearman":
            return scipy_corrs[metric](
                self.embeddings[ele1], self.embeddings[ele2]
            ).correlation
        elif metric == "cosine_similarity":
            return cosine_similarity(self.embeddings[ele1], self.embeddings[ele2])

    def compute_distance_metric(
        self, ele1: str, ele2: str, metric: str = "euclidean"
    ) -> float:
        """
        Compute distance metric between two vectors.

        Allowed metrics:

        * euclidean
        * manhattan
        * chebyshev
        * wasserstein
        * energy
        * cosine_distance

        Args:
            ele1 (str): element symbol
            ele2 (str): element symbol
            metric (str): name of a distance metric

        Returns:
            distance (float): distance between embedding vectors
        """
        # Define the allowable metrics
        scikit_metrics = ["euclidean", "manhattan", "chebyshev"]

        scipy_metrics = {"wasserstein": wasserstein_distance, "energy": energy_distance}

        valid_metrics = scikit_metrics + list(scipy_metrics.keys()) + ["cosine"]

        # Validate if the elements are within the embedding vector
        if not all([self._is_el_in_embedding(ele1), self._is_el_in_embedding(ele2)]):
            if not self._is_el_in_embedding(ele1):
                print(f"{ele1} is not an element included within the atomic embeddings")
                raise ValueError

            elif not self._is_el_in_embedding(ele2):
                print(f"{ele2} is not an element included within the atomic embeddings")
                raise ValueError

        # Compute the distance measure
        if metric in scikit_metrics:
            distance = DistanceMetric.get_metric(metric)

            return distance.pairwise(
                self.embeddings[ele1].reshape(1, -1),
                self.embeddings[ele2].reshape(1, -1),
            )[0][0]

        elif metric in scipy_metrics.keys():
            return scipy_metrics[metric](self.embeddings[ele1], self.embeddings[ele2])
        elif metric == "cosine_distance":
            return cosine_distance(self.embeddings[ele1], self.embeddings[ele2])

        else:
            print(
                "Invalid distance metric."
                f"Use one of the following metrics:{valid_metrics}"
            )
            raise ValueError

    def distance_df(self, metric: str = "euclidean") -> pd.DataFrame:
        """
        Tabulate the chosen distance metric for every pair of elements.

        Allowed metrics:

        * euclidean
        * manhattan
        * chebyshev
        * wasserstein
        * energy

        Args:
            metric (str): A distance metric.

        Returns:
            df (pandas.DataFrame): A dataframe with columns
            ["ele_1", "ele_2", "mend_1", "mend_2", "Z_1", "Z_2", metric].
        """
        rows = []
        for el_a, el_b in self.create_pairs():
            value = self.compute_distance_metric(el_a, el_b, metric=metric)
            rows.append((el_a, el_b, value))
            if el_a != el_b:
                # Mirror the pair so the table covers both orderings.
                rows.append((el_b, el_a, value))
        df = pd.DataFrame(rows, columns=["ele_1", "ele_2", metric])

        # Attach (sort_key, symbol) tuples for mendeleev and atomic numbers.
        df["mend_1"] = [(Element(sym).mendeleev_no, sym) for sym in df["ele_1"]]
        df["mend_2"] = [(Element(sym).mendeleev_no, sym) for sym in df["ele_2"]]
        df["Z_1"] = [(pt[sym]["number"], sym) for sym in df["ele_1"]]
        df["Z_2"] = [(pt[sym]["number"], sym) for sym in df["ele_2"]]

        return df[["ele_1", "ele_2", "mend_1", "mend_2", "Z_1", "Z_2", metric]]

    def correlation_df(self, metric: str = "pearson") -> pd.DataFrame:
        """
        Tabulate the chosen correlation metric for every pair of elements.

        Allowed metrics:

        * pearson
        * spearman
        * cosine_similarity


        Args:
            metric (str): A correlation/similarity metric.

        Returns:
            df (pandas.DataFrame): A dataframe with columns
            ["ele_1", "ele_2", "mend_1", "mend_2", "Z_1", "Z_2", metric].
        """
        rows = []
        for el_a, el_b in self.create_pairs():
            value = self.compute_correlation_metric(el_a, el_b, metric=metric)
            rows.append((el_a, el_b, value))
            if el_a != el_b:
                # Mirror the pair so the table covers both orderings.
                rows.append((el_b, el_a, value))
        df = pd.DataFrame(rows, columns=["ele_1", "ele_2", metric])

        # Attach (sort_key, symbol) tuples for mendeleev and atomic numbers.
        df["mend_1"] = [(Element(sym).mendeleev_no, sym) for sym in df["ele_1"]]
        df["mend_2"] = [(Element(sym).mendeleev_no, sym) for sym in df["ele_2"]]
        df["Z_1"] = [(pt[sym]["number"], sym) for sym in df["ele_1"]]
        df["Z_2"] = [(pt[sym]["number"], sym) for sym in df["ele_2"]]

        return df[["ele_1", "ele_2", "mend_1", "mend_2", "Z_1", "Z_2", metric]]

    def distance_pivot_table(
        self, metric: str = "euclidean", sortby: str = "mendeleev"
    ) -> pd.DataFrame:
        """
        Return a pandas.DataFrame style pivot.

        The index and column being either the mendeleev number or atomic number
        of the element pairs and the values being a user-specified distance metric.

        Args:
            metric (str): A distance metric.
            sortby (str): Sort the pivot table by either "mendeleev" or "atomic_number".

        Returns:
            distance_pivot (pandas.DataFrame): A pandas DataFrame pivot table.
        """
        corr_df = self.distance_df(metric=metric)
        if sortby == "mendeleev":
            distance_pivot = corr_df.pivot_table(
                values=metric, index="mend_1", columns="mend_2"
            )
            return distance_pivot
        elif sortby == "atomic_number":
            distance_pivot = corr_df.pivot_table(
                values=metric, index="Z_1", columns="Z_2"
            )
            return distance_pivot

    def correlation_pivot_table(
        self, metric: str = "pearson", sortby: str = "mendeleev"
    ) -> pd.DataFrame:
        """
        Return a pandas.DataFrame style pivot.

        The index and column being either the mendeleev number or atomic number
        of the element pairs and the values being a user-specified distance metric.

        Args:
            metric (str): A distance metric.
            sortby (str): Sort the pivot table by either "mendeleev" or "atomic_number".

        Returns:
            distance_pivot (pandas.DataFrame): A pandas DataFrame pivot table.
        """
        corr_df = self.correlation_df(metric=metric)
        if sortby == "mendeleev":
            correlation_pivot = corr_df.pivot_table(
                values=metric, index="mend_1", columns="mend_2"
            )
            return correlation_pivot
        elif sortby == "atomic_number":
            correlation_pivot = corr_df.pivot_table(
                values=metric, index="Z_1", columns="Z_2"
            )
            return correlation_pivot

    def calculate_PC(self, n_components: int = 2, standardise: bool = True, **kwargs):
        """Project the embeddings onto their principal components (PC).

        Args:
            n_components (int): The number of components to project the embeddings to.
            standardise (bool): Whether to standardise the embeddings before projecting.
            **kwargs: Other keyword arguments to be passed to PCA.
        """
        if standardise:
            # Standardise in place first if that has not been done yet.
            if not self.is_standardised:
                self.standardise(inplace=True)
        else:
            warnings.warn(
                """It is recommended to scale the embeddings
                before projecting with PCA.
                To do so, set `standardise=True`."""
            )
        embeddings_array = np.array(list(self.embeddings.values()))

        projector = decomposition.PCA(
            n_components=n_components, **kwargs
        )  # project to N dimensions
        projector.fit(embeddings_array)
        self._pca_data = projector.transform(embeddings_array)
        return self._pca_data

    def calculate_tSNE(self, n_components: int = 2, standardise: bool = True, **kwargs):
        """Project the embeddings with t-SNE.

        Args:
            n_components (int): The number of components to project the embeddings to.
            standardise (bool): Whether to standardise the embeddings before projecting.
            **kwargs: Other keyword arguments to be passed to t-SNE.
        """
        if standardise:
            # Standardise in place first if that has not been done yet.
            if not self.is_standardised:
                self.standardise(inplace=True)
        else:
            warnings.warn(
                """It is recommended to scale the embeddings
                before projecting with t-SNE.
                To do so, set `standardise=True`."""
            )
        embeddings_array = np.array(list(self.embeddings.values()))

        reducer = TSNE(n_components=n_components, **kwargs)
        self._tsne_data = reducer.fit_transform(embeddings_array)
        return self._tsne_data

    def calculate_UMAP(self, n_components: int = 2, standardise: bool = True, **kwargs):
        """Project the embeddings with UMAP.

        Args:
            n_components (int): The number of components to project the embeddings to.
            standardise (bool): Whether to scale the embeddings before projecting.
            **kwargs: Other keyword arguments to be passed to UMAP.
        """
        if standardise:
            # Standardise in place first if that has not been done yet.
            if not self.is_standardised:
                self.standardise(inplace=True)
        else:
            warnings.warn(
                """It is recommended to scale the embeddings
                before projecting with UMAP.
                To do so, set `standardise=True`."""
            )
        embeddings_array = np.array(list(self.embeddings.values()))

        reducer = UMAP(n_components=n_components, **kwargs)
        self._umap_data = reducer.fit_transform(embeddings_array)
        return self._umap_data

element_groups_dict: Dict[str, str] property

Return a dictionary of {element: element type} pairs.

e.g. {'He':'Noble gas'}

element_list: list property

Return the elements of the embedding.

__init__(embeddings, embedding_name=None, feature_labels=None)

Initialise the Embedding class.

Parameters:

Name Type Description Default
embeddings dict

A {element_symbol: vector} dictionary

required
embedding_name str

The name of the elemental representation

None
feature_labels list(str

A list of feature labels

None
Source code in src/elementembeddings/core.py
def __init__(
    self,
    embeddings: dict,
    embedding_name: Optional[str] = None,
    feature_labels: Optional[List[str]] = None,
):
    """Initialise the Embedding class.

    Args:
        embeddings (dict): A {element_symbol: vector} dictionary
        embedding_name (str): The name of the elemental representation
        feature_labels (list(str)): A list of feature labels
    """
    self.embeddings = embeddings
    self.embedding_name = embedding_name
    self.feature_labels = feature_labels
    # Record whether the supplied vectors are already standardised.
    self.is_standardised = bool(self._is_standardised())

    # Grab a random value from the embedding vector
    _rand_embed = random.choice(list(self.embeddings.values()))
    # Convert embeddings to numpy array if not already a numpy array
    if not isinstance(_rand_embed, np.ndarray):
        self.embeddings = {
            ele: np.array(self.embeddings[ele]) for ele in self.embeddings
        }

    # Determines if the embedding vector has a length attribute
    # (i.e. is not a scalar int or float)
    # If the 'vector' is a scalar/float, the representation is linear
    # A linear representation gets converted to a one-hot vector
    if hasattr(_rand_embed, "__len__") and not isinstance(_rand_embed, str):
        self.embedding_type: str = "vector"
        self.dim: int = len(random.choice(list(self.embeddings.values())))
    else:
        self.embedding_type: str = "linear"

    # Create one-hot vectors for a scalar representation
    if self.embedding_type == "linear":
        sorted_embedding = sorted(self.embeddings.items(), key=lambda x: x[1])
        elements = np.loadtxt(
            f"{data_directory}/element_data/ordered_periodic.txt", dtype=str
        )
        # The modified Pettifor scale only covers 103 elements; every other
        # linear scale spans the full periodic table.
        max_el = 103 if self.embedding_name == "mod_petti" else 118
        sorted_embedding = {
            el: num for el, num in sorted_embedding if el in elements[:max_el]
        }
        self.feature_labels = list(sorted_embedding.keys())
        self.embeddings = {}

        # One-hot encode: the scalar value becomes the index of the 1.
        for el, num in sorted_embedding.items():
            self.embeddings[el] = np.zeros(len(sorted_embedding))
            self.embeddings[el][num] = 1
        self.dim = len(random.choice(list(self.embeddings.values())))

    if not self.feature_labels:
        # Fall back to integer component labels when none were supplied.
        self.feature_labels = list(range(self.dim))

    # Dummy initialisation for results
    self._data = []
    self._pca_data = None  # type: Optional[np.ndarray]
    self._tsne_data = None  # type: Optional[np.ndarray]
    self._umap_data = None  # type: Optional[np.ndarray]

as_dataframe(columns='components')

Return the embedding as a pandas Dataframe.

The first column is the elements and each other column represents a component of the embedding.

Parameters:

Name Type Description Default
columns str

A string to specify if the columns are the vector components

'components'

Returns:

Name Type Description
df pandas.DataFrame

A pandas dataframe object

Source code in src/elementembeddings/core.py
def as_dataframe(self, columns: str = "components") -> pd.DataFrame:
    """
    Return the embedding as a pandas Dataframe.

    The first column is the elements and each other
    column represents a component of the embedding.

    Args:
        columns (str): A string to specify if the columns are the vector components
        and the index is the elements (`columns='components'`)
        or the columns are the elements (`columns='elements'`).

    Returns:
        df (pandas.DataFrame): A pandas dataframe object

    Raises:
        ValueError: If `columns` is neither 'components' nor 'elements'.
    """
    df = pd.DataFrame(self.embeddings, index=self.feature_labels)
    if columns == "components":
        # Elements as the index, vector components as the columns.
        return df.T
    elif columns == "elements":
        return df
    else:
        # Fixed: the original message was missing the closing quote
        # around 'elements'.
        raise ValueError(
            f"{columns} is not a valid keyword argument. "
            "Choose either 'components' or 'elements'"
        )

calculate_PC(n_components=2, standardise=True, **kwargs)

Calculate the principal components (PC) of the embeddings.

Parameters:

Name Type Description Default
n_components int

The number of components to project the embeddings to.

2
standardise bool

Whether to standardise the embeddings before projecting.

True
**kwargs

Other keyword arguments to be passed to PCA.

{}
Source code in src/elementembeddings/core.py
def calculate_PC(self, n_components: int = 2, standardise: bool = True, **kwargs):
    """Calculate the principal components (PC) of the embeddings.

    Args:
        n_components (int): The number of components to project the embeddings to.
        standardise (bool): Whether to standardise the embeddings before projecting.
        **kwargs: Other keyword arguments to be passed to PCA.
    """
    if standardise:
        # Standardise in place first if the vectors are not already scaled.
        if not self.is_standardised:
            self.standardise(inplace=True)
    else:
        warnings.warn(
            """It is recommended to scale the embeddings
            before projecting with PCA.
            To do so, set `standardise=True`."""
        )
    embeddings_array = np.array(list(self.embeddings.values()))

    # Project to N dimensions and cache the transformed coordinates.
    pca = decomposition.PCA(n_components=n_components, **kwargs)
    pca.fit(embeddings_array)
    self._pca_data = pca.transform(embeddings_array)
    return self._pca_data

calculate_UMAP(n_components=2, standardise=True, **kwargs)

Calculate UMAP embeddings.

Parameters:

Name Type Description Default
n_components int

The number of components to project the embeddings to.

2
standardise bool

Whether to scale the embeddings before projecting.

True
**kwargs

Other keyword arguments to be passed to UMAP.

{}
Source code in src/elementembeddings/core.py
def calculate_UMAP(self, n_components: int = 2, standardise: bool = True, **kwargs):
    """Calculate UMAP embeddings.

    Args:
        n_components (int): The number of components to project the embeddings to.
        standardise (bool): Whether to scale the embeddings before projecting.
        **kwargs: Other keyword arguments to be passed to UMAP.
    """
    if standardise:
        # Standardise in place first if the vectors are not already scaled.
        if not self.is_standardised:
            self.standardise(inplace=True)
    else:
        warnings.warn(
            """It is recommended to scale the embeddings
            before projecting with UMAP.
            To do so, set `standardise=True`."""
        )
    embeddings_array = np.array(list(self.embeddings.values()))

    # Fit and project in one step; cache the result on the instance.
    reducer = UMAP(n_components=n_components, **kwargs)
    self._umap_data = reducer.fit_transform(embeddings_array)
    return self._umap_data

calculate_tSNE(n_components=2, standardise=True, **kwargs)

Calculate t-SNE components.

Parameters:

Name Type Description Default
n_components int

The number of components to project the embeddings to.

2
standardise bool

Whether to standardise the embeddings before projecting.

True
**kwargs

Other keyword arguments to be passed to t-SNE.

{}
Source code in src/elementembeddings/core.py
def calculate_tSNE(self, n_components: int = 2, standardise: bool = True, **kwargs):
    """Calculate t-SNE components.

    Args:
        n_components (int): The number of components to project the embeddings to.
        standardise (bool): Whether to standardise the embeddings before projecting.
        **kwargs: Other keyword arguments to be passed to t-SNE.
    """
    if standardise:
        # Standardise in place first if the vectors are not already scaled.
        if not self.is_standardised:
            self.standardise(inplace=True)
    else:
        warnings.warn(
            """It is recommended to scale the embeddings
            before projecting with t-SNE.
            To do so, set `standardise=True`."""
        )
    embeddings_array = np.array(list(self.embeddings.values()))

    # Fit and project in one step; cache the result on the instance.
    projector = TSNE(n_components=n_components, **kwargs)
    self._tsne_data = projector.fit_transform(embeddings_array)
    return self._tsne_data

citation()

Return a citation for the embedding.

Source code in src/elementembeddings/core.py
def citation(self) -> List[str]:
    """Return a citation for the embedding.

    Returns:
        citation (list(str)): A list of BibTeX citation strings for the
        source publication of this embedding; empty if the embedding
        name is not recognised.
    """
    if self.embedding_name in ["magpie", "magpie_sc"]:
        citation = [
            "@article{ward2016general,"
            "title={A general-purpose machine learning framework for "
            "predicting properties of inorganic materials},"
            "author={Ward, Logan and Agrawal, Ankit and Choudhary, Alok "
            "and Wolverton, Christopher},"
            "journal={npj Computational Materials},"
            "volume={2},"
            "number={1},"
            "pages={1--7},"
            "year={2016},"
            "publisher={Nature Publishing Group}}"
        ]
    elif self.embedding_name == "mat2vec":
        citation = [
            "@article{tshitoyan2019unsupervised,"
            "title={Unsupervised word embeddings capture latent knowledge "
            "from materials science literature},"
            "author={Tshitoyan, Vahe and Dagdelen, John and Weston, Leigh "
            "and Dunn, Alexander and Rong, Ziqin and Kononova, Olga "
            "and Persson, Kristin A and Ceder, Gerbrand and Jain, Anubhav},"
            "journal={Nature},"
            "volume={571},"
            "number={7763},"
            "pages={95--98},"
            "year={2019},"
            "publisher={Nature Publishing Group} }"
        ]
    elif self.embedding_name == "matscholar":
        citation = [
            "@article{weston2019named,"
            "title={Named entity recognition and normalization applied to "
            "large-scale information extraction from the materials "
            "science literature},"
            "author={Weston, Leigh and Tshitoyan, Vahe and Dagdelen, John and "
            "Kononova, Olga and Trewartha, Amalie and Persson, Kristin A and "
            "Ceder, Gerbrand and Jain, Anubhav},"
            "journal={Journal of chemical information and modeling},"
            "volume={59},"
            "number={9},"
            "pages={3692--3702},"
            "year={2019},"
            "publisher={ACS Publications} }"
        ]

    elif self.embedding_name == "megnet16":
        citation = [
            "@article{chen2019graph,"
            "title={Graph networks as a universal machine learning framework "
            "for molecules and crystals},"
            "author={Chen, Chi and Ye, Weike and Zuo, Yunxing and "
            "Zheng, Chen and Ong, Shyue Ping},"
            "journal={Chemistry of Materials},"
            "volume={31},"
            "number={9},"
            "pages={3564--3572},"
            "year={2019},"
            "publisher={ACS Publications} }"
        ]

    elif self.embedding_name in ["oliynyk", "oliynyk_sc"]:
        # Fixed: the entry previously began with stray leading spaces
        # ("              @article{..."), which leaked into the citation text.
        citation = [
            "@article{oliynyk2016high,"
            "title={High-throughput machine-learning-driven synthesis "
            "of full-Heusler compounds},"
            "author={Oliynyk, Anton O and Antono, Erin and Sparks, Taylor D and "
            "Ghadbeigi, Leila and Gaultois, Michael W and "
            "Meredig, Bryce and Mar, Arthur},"
            "journal={Chemistry of Materials},"
            "volume={28},"
            "number={20},"
            "pages={7324--7331},"
            "year={2016},"
            "publisher={ACS Publications} }"
        ]

    elif self.embedding_name == "skipatom":
        citation = [
            "@article{antunes2022distributed,"
            "title={Distributed representations of atoms and materials "
            "for machine learning},"
            "author={Antunes, Luis M and Grau-Crespo, Ricardo and Butler, Keith T},"
            "journal={npj Computational Materials},"
            "volume={8},"
            "number={1},"
            "pages={1--9},"
            "year={2022},"
            "publisher={Nature Publishing Group} }"
        ]
    elif self.embedding_name == "mod_petti":
        citation = [
            "@article{glawe2016optimal,"
            "title={The optimal one dimensional periodic table: "
            "a modified Pettifor chemical scale from data mining},"
            "author={Glawe, Henning and Sanna, Antonio and Gross, "
            "EKU and Marques, Miguel AL},"
            "journal={New Journal of Physics},"
            "volume={18},"
            "number={9},"
            "pages={093011},"
            "year={2016},"
            "publisher={IOP Publishing} }"
        ]

    else:
        # Unknown embedding: no citation available.
        citation = []

    return citation

compute_correlation_metric(ele1, ele2, metric='pearson')

Compute the correlation/similarity metric between two vectors.

Allowed metrics: * Pearson * Spearman * Cosine similarity

Parameters:

Name Type Description Default
ele1 str

element symbol

required
ele2 str

element symbol

required
metric str

name of a correlation metric.

'pearson'

Returns:

Name Type Description
float float

correlation/similarity metric

Source code in src/elementembeddings/core.py
def compute_correlation_metric(
    self, ele1: str, ele2: str, metric: str = "pearson"
) -> float:
    """
    Compute the correlation/similarity metric between two vectors.

    Allowed metrics:
    * Pearson
    * Spearman
    * Cosine similarity

    Args:
        ele1 (str): element symbol
        ele2 (str): element symbol
        metric (str): name of a correlation metric.
        Options are "spearman", "pearson" and "cosine_similarity".

    Returns:
        float: correlation/similarity metric

    Raises:
        ValueError: If `metric` is not one of the allowed options.
    """
    if metric == "pearson":
        return pearsonr(self.embeddings[ele1], self.embeddings[ele2]).statistic
    elif metric == "spearman":
        return spearmanr(self.embeddings[ele1], self.embeddings[ele2]).correlation
    elif metric == "cosine_similarity":
        return cosine_similarity(self.embeddings[ele1], self.embeddings[ele2])
    else:
        # Previously an invalid metric silently returned None.
        raise ValueError(
            f"{metric} is not a valid metric. "
            "Choose one of 'pearson', 'spearman' or 'cosine_similarity'."
        )

compute_distance_metric(ele1, ele2, metric='euclidean')

Compute distance metric between two vectors.

Allowed metrics:

  • euclidean
  • manhattan
  • chebyshev
  • wasserstein
  • energy
  • cosine_distance

Parameters:

Name Type Description Default
ele1 str

element symbol

required
ele2 str

element symbol

required
metric str

name of a distance metric

'euclidean'

Returns:

Name Type Description
distance float

distance between embedding vectors

Source code in src/elementembeddings/core.py
def compute_distance_metric(
    self, ele1: str, ele2: str, metric: str = "euclidean"
) -> float:
    """
    Compute distance metric between two vectors.

    Allowed metrics:

    * euclidean
    * manhattan
    * chebyshev
    * wasserstein
    * energy
    * cosine_distance

    Args:
        ele1 (str): element symbol
        ele2 (str): element symbol
        metric (str): name of a distance metric

    Returns:
        distance (float): distance between embedding vectors

    Raises:
        ValueError: If either element is not in the embedding, or if the
            metric name is invalid.
    """
    # Define the allowable metrics
    scikit_metrics = ["euclidean", "manhattan", "chebyshev"]
    scipy_metrics = {"wasserstein": wasserstein_distance, "energy": energy_distance}
    # Fixed: the accepted keyword is "cosine_distance", but the error
    # message previously advertised "cosine".
    valid_metrics = scikit_metrics + list(scipy_metrics.keys()) + ["cosine_distance"]

    # Validate that both elements are within the embedding
    for ele in (ele1, ele2):
        if not self._is_el_in_embedding(ele):
            raise ValueError(
                f"{ele} is not an element included within the atomic embeddings"
            )

    # Compute the distance measure
    if metric in scikit_metrics:
        distance = DistanceMetric.get_metric(metric)
        return distance.pairwise(
            self.embeddings[ele1].reshape(1, -1),
            self.embeddings[ele2].reshape(1, -1),
        )[0][0]
    elif metric in scipy_metrics:
        return scipy_metrics[metric](self.embeddings[ele1], self.embeddings[ele2])
    elif metric == "cosine_distance":
        return cosine_distance(self.embeddings[ele1], self.embeddings[ele2])
    # Previously this printed and raised a bare ValueError; the message now
    # travels with the exception.
    raise ValueError(
        "Invalid distance metric. "
        f"Use one of the following metrics:{valid_metrics}"
    )

correlation_df(metric='pearson')

Return a dataframe with columns ["ele_1", "ele_2", metric].

Allowed metrics:

  • pearson
  • spearman
  • cosine_similarity

Parameters:

Name Type Description Default
metric str

A distance metric.

'pearson'

Returns:

Name Type Description
df pandas.DataFrame

A dataframe with columns ["ele_1", "ele_2", metric].

Source code in src/elementembeddings/core.py
def correlation_df(self, metric: str = "pearson") -> pd.DataFrame:
    """
    Return a dataframe with columns ["ele_1", "ele_2", metric].

    Allowed metrics:

    * pearson
    * spearman
    * cosine_similarity

    Args:
        metric (str): A distance metric.

    Returns:
        df (pandas.DataFrame): A dataframe with columns ["ele_1", "ele_2", metric].
    """
    # Tabulate the metric for every unordered element pair, mirroring each
    # off-diagonal pair so the resulting frame is symmetric.
    rows = []
    for ele1, ele2 in self.create_pairs():
        value = self.compute_correlation_metric(ele1, ele2, metric=metric)
        rows.append((ele1, ele2, value))
        if ele1 != ele2:
            rows.append((ele2, ele1, value))
    corr_df = pd.DataFrame(rows, columns=["ele_1", "ele_2", metric])

    # Attach (mendeleev number, symbol) and (atomic number, symbol) tuples
    # so downstream pivots can sort by either ordering.
    corr_df["mend_1"] = [(Element(el).mendeleev_no, el) for el in corr_df["ele_1"]]
    corr_df["mend_2"] = [(Element(el).mendeleev_no, el) for el in corr_df["ele_2"]]
    corr_df["Z_1"] = [(pt[el]["number"], el) for el in corr_df["ele_1"]]
    corr_df["Z_2"] = [(pt[el]["number"], el) for el in corr_df["ele_2"]]

    return corr_df[["ele_1", "ele_2", "mend_1", "mend_2", "Z_1", "Z_2", metric]]

correlation_pivot_table(metric='pearson', sortby='mendeleev')

Return a pandas.DataFrame style pivot.

The index and column being either the mendeleev number or atomic number of the element pairs and the values being a user-specified distance metric.

Parameters:

Name Type Description Default
metric str

A distance metric.

'pearson'
sortby str

Sort the pivot table by either "mendeleev" or "atomic_number".

'mendeleev'

Returns:

Name Type Description
distance_pivot pandas.DataFrame

A pandas DataFrame pivot table.

Source code in src/elementembeddings/core.py
def correlation_pivot_table(
    self, metric: str = "pearson", sortby: str = "mendeleev"
) -> pd.DataFrame:
    """
    Return a pandas.DataFrame style pivot.

    The index and column being either the mendeleev number or atomic number
    of the element pairs and the values being a user-specified distance metric.

    Args:
        metric (str): A distance metric.
        sortby (str): Sort the pivot table by either "mendeleev" or "atomic_number".

    Returns:
        distance_pivot (pandas.DataFrame): A pandas DataFrame pivot table.

    Raises:
        ValueError: If `sortby` is neither "mendeleev" nor "atomic_number".
    """
    corr_df = self.correlation_df(metric=metric)
    if sortby == "mendeleev":
        index_col, columns_col = "mend_1", "mend_2"
    elif sortby == "atomic_number":
        index_col, columns_col = "Z_1", "Z_2"
    else:
        # Previously an invalid `sortby` silently returned None.
        raise ValueError(
            f"{sortby} is not a valid keyword argument. "
            "Choose either 'mendeleev' or 'atomic_number'."
        )
    return corr_df.pivot_table(values=metric, index=index_col, columns=columns_col)
create_pairs()

Create all possible pairs of elements.

Source code in src/elementembeddings/core.py
def create_pairs(self):
    """Create all possible pairs of elements."""
    # Unordered pairs, including self-pairs such as ("H", "H").
    return combinations_with_replacement(self.element_list, 2)

distance_df(metric='euclidean')

Return a dataframe with columns ["ele_1", "ele_2", metric].

Allowed metrics:

  • euclidean
  • manhattan
  • chebyshev
  • wasserstein
  • energy

Parameters:

Name Type Description Default
metric str

A distance metric.

'euclidean'

Returns:

Name Type Description
df pandas.DataFrame

A dataframe with columns ["ele_1", "ele_2", metric].

Source code in src/elementembeddings/core.py
def distance_df(self, metric: str = "euclidean") -> pd.DataFrame:
    """
    Return a dataframe with columns ["ele_1", "ele_2", metric].

    Allowed metrics:

    * euclidean
    * manhattan
    * chebyshev
    * wasserstein
    * energy

    Args:
        metric (str): A distance metric.

    Returns:
        df (pandas.DataFrame): A dataframe with columns ["ele_1", "ele_2", metric].
    """
    # Tabulate the distance for every unordered element pair, mirroring each
    # off-diagonal pair so the resulting frame is symmetric.
    rows = []
    for ele1, ele2 in self.create_pairs():
        value = self.compute_distance_metric(ele1, ele2, metric=metric)
        rows.append((ele1, ele2, value))
        if ele1 != ele2:
            rows.append((ele2, ele1, value))
    dist_df = pd.DataFrame(rows, columns=["ele_1", "ele_2", metric])

    # Attach (mendeleev number, symbol) and (atomic number, symbol) tuples
    # so downstream pivots can sort by either ordering.
    dist_df["mend_1"] = [(Element(el).mendeleev_no, el) for el in dist_df["ele_1"]]
    dist_df["mend_2"] = [(Element(el).mendeleev_no, el) for el in dist_df["ele_2"]]
    dist_df["Z_1"] = [(pt[el]["number"], el) for el in dist_df["ele_1"]]
    dist_df["Z_2"] = [(pt[el]["number"], el) for el in dist_df["ele_2"]]

    return dist_df[["ele_1", "ele_2", "mend_1", "mend_2", "Z_1", "Z_2", metric]]

distance_pivot_table(metric='euclidean', sortby='mendeleev')

Return a pandas.DataFrame style pivot.

The index and column being either the mendeleev number or atomic number of the element pairs and the values being a user-specified distance metric.

Parameters:

Name Type Description Default
metric str

A distance metric.

'euclidean'
sortby str

Sort the pivot table by either "mendeleev" or "atomic_number".

'mendeleev'

Returns:

Name Type Description
distance_pivot pandas.DataFrame

A pandas DataFrame pivot table.

Source code in src/elementembeddings/core.py
def distance_pivot_table(
    self, metric: str = "euclidean", sortby: str = "mendeleev"
) -> pd.DataFrame:
    """
    Return a pandas.DataFrame style pivot.

    The index and column being either the mendeleev number or atomic number
    of the element pairs and the values being a user-specified distance metric.

    Args:
        metric (str): A distance metric.
        sortby (str): Sort the pivot table by either "mendeleev" or "atomic_number".

    Returns:
        distance_pivot (pandas.DataFrame): A pandas DataFrame pivot table.

    Raises:
        ValueError: If `sortby` is neither "mendeleev" nor "atomic_number".
    """
    corr_df = self.distance_df(metric=metric)
    if sortby == "mendeleev":
        index_col, columns_col = "mend_1", "mend_2"
    elif sortby == "atomic_number":
        index_col, columns_col = "Z_1", "Z_2"
    else:
        # Previously an invalid `sortby` silently returned None.
        raise ValueError(
            f"{sortby} is not a valid keyword argument. "
            "Choose either 'mendeleev' or 'atomic_number'."
        )
    return corr_df.pivot_table(values=metric, index=index_col, columns=columns_col)

from_csv(embedding_csv, embedding_name=None) staticmethod

Create an instance of the Embedding class from a csv file.

The first column of the csv file must contain the elements and be named element.

Parameters:

Name Type Description Default
embedding_csv str

Filepath of the csv file

required
embedding_name str

The name of the elemental representation

None
Source code in src/elementembeddings/core.py
@staticmethod
def from_csv(embedding_csv, embedding_name: Optional[str] = None):
    """
    Create an instance of the Embedding class from a csv file.

    The first column of the csv file must contain the elements and be named element.

    Args:
        embedding_csv (str): Filepath of the csv file
        embedding_name (str): The name of the elemental representation

    """
    # Need to add validation handling for csv files
    df = pd.read_csv(embedding_csv)
    elements = list(df["element"])
    df = df.drop(columns=["element"])
    feature_labels = list(df.columns)
    # Pair each element symbol with its row of the feature matrix.
    embedding_data = dict(zip(elements, df.to_numpy()))
    return Embedding(embedding_data, embedding_name, feature_labels)

from_json(embedding_json, embedding_name=None) staticmethod

Create an instance of the Embedding class from a json file.

Parameters:

Name Type Description Default
embedding_json str

Filepath of the json file

required
embedding_name str

The name of the elemental representation

None
Source code in src/elementembeddings/core.py
@staticmethod
def from_json(embedding_json, embedding_name: Optional[str] = None):
    """
    Create an instance of the Embedding class from a json file.

    Args:
        embedding_json (str): Filepath of the json file
        embedding_name (str): The name of the elemental representation
    """
    # Need to add validation handling for JSONs in different formats
    with open(embedding_json) as file:
        data = json.load(file)
    return Embedding(data, embedding_name)

load_data(embedding_name=None) staticmethod

Create an instance of the Embedding class from a default embedding file.

The default embeddings are in the table below:

Name str_name
Magpie magpie
Magpie (scaled) magpie_sc
Mat2Vec mat2vec
Matscholar matscholar
Megnet (16 dimensions) megnet16
Modified pettifor scale mod_petti
Oliynyk oliynyk
Oliynyk (scaled) oliynyk_sc
Random (200 dimensions) random_200
SkipAtom skipatom
Atomic Number atomic

Parameters:

Name Type Description Default
embedding_name str

The str_name of an embedding file.

None

Returns:

Name Type Description
Embedding

class:Embedding instance.

Source code in src/elementembeddings/core.py
@staticmethod
def load_data(embedding_name: Optional[str] = None):
    """
    Create an instance of the `Embedding` class from a default embedding file.

    The default embeddings are in the table below:

    | **Name**                | **str_name** |
    |-------------------------|--------------|
    | Magpie                  | magpie       |
    | Magpie (scaled)         | magpie_sc    |
    | Mat2Vec                 | mat2vec      |
    | Matscholar              | matscholar   |
    | Megnet (16 dimensions)  | megnet16     |
    | Modified pettifor scale | mod_petti    |
    | Oliynyk                 | oliynyk      |
    | Oliynyk (scaled)        | oliynyk_sc   |
    | Random (200 dimensions) | random_200   |
    | SkipAtom                | skipatom     |
    | Atomic Number           | atomic       |


    Args:
        embedding_name (str): The str_name of an embedding file.

    Returns:
        Embedding :class:`Embedding` instance.

    Raises:
        ValueError: If `embedding_name` is None or not one of the
            embeddings listed above.
    """
    _cbfv_files = {
        "magpie": "magpie.csv",
        "magpie_sc": "magpie_sc.json",
        "mat2vec": "mat2vec.csv",
        "matscholar": "matscholar-embedding.json",
        "megnet16": "megnet16.json",
        "mod_petti": "mod_petti.json",
        "oliynyk": "oliynyk_preprocessed.csv",
        "oliynyk_sc": "oliynyk_sc.json",
        "random_200": "random_200_new.csv",
        "skipatom": "skipatom_20201009_induced.csv",
        "atomic": "atomic.json",
    }

    # Fail fast with a helpful message instead of a bare KeyError
    # (the default embedding_name=None previously raised KeyError: None).
    if embedding_name not in _cbfv_files:
        raise ValueError(
            f"{embedding_name} is not a valid embedding name. "
            f"Choose one of {sorted(_cbfv_files)}"
        )

    fname = _cbfv_files[embedding_name]
    fpath = path.join(data_directory, "element_representations", fname)
    if fname.endswith(".csv"):
        return Embedding.from_csv(fpath, embedding_name)
    elif "megnet" in fname:
        # The MEGNet file ships a "Null" placeholder element; drop it.
        return Embedding.from_json(fpath, embedding_name).remove_elements(["Null"])
    return Embedding.from_json(fpath, embedding_name)

remove_elements(elements, inplace=False)

Remove elements from the Embedding instance.

Parameters:

Name Type Description Default
elements str, list(str)

An element symbol or a list of element symbols

required
inplace bool

If True, elements are removed from the Embedding instance.

False
Source code in src/elementembeddings/core.py
def remove_elements(self, elements: Union[str, List[str]], inplace: bool = False):
    # TO-DO allow removal by atomic numbers
    """
    Remove elements from the Embedding instance.

    Args:
        elements (str,list(str)): An element symbol or a list of element symbols
        inplace (bool): If True, elements are removed from the Embedding instance.
        If false, the original embedding instance is unchanged
        and a new embedding instance with the elements removed is created.

    """
    # Normalise the input so both forms share one deletion loop.
    if isinstance(elements, str):
        symbols = [elements]
    elif isinstance(elements, list):
        symbols = elements
    else:
        symbols = []

    if inplace:
        for symbol in symbols:
            del self.embeddings[symbol]
        return None

    remaining = self.embeddings.copy()
    for symbol in symbols:
        del remaining[symbol]
    return Embedding(remaining, self.embedding_name)

standardise(inplace=False)

Standardise the embeddings.

Mean is 0 and standard deviation is 1.

Source code in src/elementembeddings/core.py
def standardise(self, inplace: bool = False):
    """Standardise the embeddings.

    Mean is 0 and standard deviation is 1.

    """
    if self._is_standardised():
        warnings.warn(
            "Embedding is already standardised. "
            "Returning None and not changing the embedding."
        )
        return None

    # Scale every component to zero mean and unit variance, keeping the
    # original element ordering.
    scaled = StandardScaler().fit_transform(
        np.array(list(self.embeddings.values()))
    )
    standardised = dict(zip(self.embeddings.keys(), scaled))

    if inplace:
        self.embeddings = standardised
        self.is_standardised = True
        return None
    return Embedding(standardised, self.embedding_name)

to(fmt='', filename='')

Output the embedding to a file.

Parameters:

Name Type Description Default
fmt str

The file format to output the embedding to.

''
filename str

The name of the file to be outputted

''

Returns:

Type Description

(str) if filename not specified, otherwise None.

Source code in src/elementembeddings/core.py
def to(self, fmt: str = "", filename: Optional[str] = ""):
    """
    Output the embedding to a file.

    Args:
        fmt (str): The file format to output the embedding to.
        Options include "json" and "csv".
        filename (str): The name of the file to be outputted
    Returns:
        (str) if filename not specified, otherwise None.
    """
    fmt = fmt.lower()

    # The format may be given explicitly or inferred from the filename.
    if fmt == "json" or fnmatch.fnmatch(filename, "*.json"):
        payload = json.dumps(self.embeddings, cls=NumpyEncoder)
        if not filename:
            return payload
        if not filename.endswith(".json"):
            filename += ".json"
        with open(filename, "w") as file:
            file.write(payload)
    elif fmt == "csv" or fnmatch.fnmatch(filename, "*.csv"):
        if not filename:
            return self.as_dataframe().to_csv(index_label="element")
        if not filename.endswith(".csv"):
            filename += ".csv"
        self.as_dataframe().to_csv(filename, index_label="element")
    else:
        raise ValueError(f"{str(fmt)} is an invalid file format")