aton.txt.extract

Description

Functions to extract specific data from raw text strings, simplifying the use of regular expressions.

Index

number()
string()
column()
coords()
element()
isotope()

Examples

To extract a float value from a string,

from aton import txt
line = 'energy =   500.0 Ry'
txt.extract.number(line, 'energy')
# 500.0  (float output)

To extract a text value, after and before specific strings,

line = 'energy =   500.0 Ry were calculated'
txt.extract.string(line, 'energy', 'were')
# '500.0 Ry'  (String output)

To extract a value from a specific column,

# Name, Energy, Force, Error
line = 'Testing    1.1    1.2    0.3'
energy = txt.extract.column(line, 1)
# '1.1'  (String output)

To extract coordinates,

line = ' He  0.10  0.20  0.30 '
txt.extract.coords(line)
# [0.1, 0.2, 0.3]  (List of floats)

To extract chemical elements,

line = ' He4  0.10  Ag  0.20  Pb  0.30 '
first_element = txt.extract.element(line, 0)
# 'He4'
third_element = txt.extract.element(line, 2)
# 'Pb'

To split an isotope name into its element and mass number,

txt.extract.isotope('He4')
# ('He', 4)  (symbol, mass number)
txt.extract.isotope('Au')
# ('Au', 0)  (No mass number)

  1"""
  2# Description
  3
  4Functions to extract specific data from raw text strings,
  5simplifying the use of regular expressions.
  6
  7
  8# Index
  9
 10`number()`  
 11`string()`  
 12`column()`  
 13`coords()`  
 14`element()`  
 15`isotope()`  
 16
 17
 18# Examples
 19
 20To extract a float value from a string,
 21```python
 22from aton import txt
 23line = 'energy =   500.0 Ry'
 24txt.extract.number(line, 'energy')
 25# 500.0  (float output)
 26```
 27
 28To extract a text value, after and before specific strings,
 29```python
 30line = 'energy =   500.0 Ry were calculated'
 31txt.extract.string(line, 'energy', 'were')
 32# '500.0 Ry'  (String output)
 33```
 34
 35To extract a value from a specific column,
 36```python
 37# Name, Energy, Force, Error
 38line = 'Testing    1.1    1.2    0.3'
 39energy = txt.extract.column(line, 1)
 40# '1.1'  (String output)
 41```
 42
 43To extract coordinates,
 44```python
 45line = ' He  0.10  0.20  0.30 '
 46txt.extract.coords(line)
 47# [0.1, 0.2, 0.3]  (List of floats)
 48```
 49
 50To extract chemical elements,
 51```python
 52line = ' He4  0.10  Ag  0.20  Pb  0.30 '
 53first_element = txt.extract.element(line, 0)
 54# 'He4'
 55third_element = txt.extract.element(line, 2)
 56# 'Pb'
 57```
 58
 59To split an isotope name into its element and mass number,
 60```python
 61txt.extract.isotope('He4')
 62# ('He', 4)  (symbol, mass number)
 63txt.extract.isotope('Au')
 64# ('Au', 0)  (No mass number)
 65```
 66
 67---
 68"""
 69
 70
 71import re
 72import periodictable
 73
 74
 75def number(
 76        text:str,
 77        name:str=''
 78    ) -> float:
 79    """Extracts the float value of a given `name` variable from a raw `text`."""
 80    if text == None:
 81        return None
 82    pattern = re.compile(rf"{name}\s*[:=]?\s*(-?\d+(?:\.\d+)?(?:[eEdD][+\-]?\d+)?)")
 83    match = pattern.search(text)
 84    if match:
 85        return float(match.group(1))
 86    return None
 87    
 88
 89def string(
 90        text:str,
 91        name:str='',
 92        stop:str='',
 93        strip:bool=True
 94    ) -> str:
 95    """Extracts the `text` value of a given `name` variable from a raw string. Stops before an optional `stop` string.
 96
 97    Removes leading and trailing commas by default, change this with `strip = False`.
 98    """
 99    pattern = re.compile(rf"{name}\s*[:=]?\s*(.*)")
100    if stop:
101        pattern = re.compile(rf"{name}\s*[:=]?\s*(.*)(?={stop})")
102    match = re.search(pattern, text)
103    if not match:
104        return None
105    result = str(match.group(1))
106    result = result.strip()
107    if strip:
108        result = result.strip("'")
109        result = result.strip('"')
110        result = result.strip()
111    return result
112
113
def column(
        text:str,
        column:int=0
    ) -> str:
    """Extracts the number found at the desired `column` index of a given `text` (0 by default).

    The `text` is split by whitespace, and the numeric value at the start of
    the selected column is returned as a string, without converting it.
    Negative indices count from the last column.

    Returns `None` if `text` is None, if the `column` index is out of range
    in either direction, or if the selected column does not start with a number.
    """
    if text is None:
        return None
    columns = text.split()
    # Check both ends of the range, so that an out-of-range negative index
    # returns None just like an out-of-range positive one.
    if not -len(columns) <= column < len(columns):
        return None
    match = re.match(r'(-?\d+(?:\.\d+)?(?:[eE][+\-]?\d+)?)', columns[column])
    return match.group(1) if match else None
128
129
def coords(text:str) -> list:
    """Returns a list with the float coordinates expressed in a given `text` string.

    The `text` is split by commas and whitespace; every resulting piece
    that starts with a number contributes that number to the output list,
    and the rest of the pieces are ignored.
    Returns `None` if `text` is None.
    """
    if text is None:
        return None
    number_at_start = re.compile(r'(-?\d+(?:\.\d+)?(?:[eE][+\-]?\d+)?)')
    pieces = re.split(r'[,\s]+', text.strip())
    hits = (number_at_start.match(piece) for piece in pieces)
    return [float(hit.group(1)) for hit in hits if hit]
142
143
def element(
        text:str,
        index:int=0,
        raise_errors=True,
    ) -> str:
    """Extract a chemical element from a raw `text` string.

    If there are several elements, you can return a specific `index` match (positive, 0 by default).
    If the `index` is greater than the number of elements found, the last one is returned.
    Allows for standard elements (H, He, Na...) and isotopes (H2, He4...).
    An error is raised if no valid element or isotope is found;
    to override this and simply return an empty string instead, set `raise_errors=False`.
    Returns `None` if `text` is None.
    """
    if text is None:
        return None
    columns = re.split(r'[,\s]+', text.strip())
    pattern = re.compile(r'\s*([A-Z][a-z]{0,2}\d{0,3})(?=\s|$)')
    # First pass: keep the pieces that look like an element or isotope name.
    candidates = [hit.group(1) for hit in map(pattern.match, columns) if hit]
    # Second pass: keep only the candidates that are actual elements or isotopes.
    # isotope() returns an empty symbol for unknown names when raise_errors=False,
    # so no exception has to be caught, and unrelated errors are never hidden.
    found_elements = []
    for candidate in candidates:
        symbol, _ = isotope(candidate, raise_errors=False)
        if symbol:
            found_elements.append(candidate)
    if not found_elements:
        if raise_errors:
            raise ValueError(f'No valid element nor isotope found in the string:\n{text}')
        return ''
    # An index past the end falls back to the last element found.
    if len(found_elements) <= index:
        return found_elements[-1]
    return found_elements[index]
181
182
def isotope(name:str, raise_errors=True) -> tuple:
    """Split the `name` of an isotope into the element and the mass number, eg. 'He4' -> ('He', 4).

    Surrounding quotes and whitespace are removed from `name` first;
    then its letters form the element symbol, and its digits form the mass number.
    The mass number will be 0 if only the element name is provided, eg. 'He' -> ('He', 0).
    If the element or isotope does not exist, a KeyError is raised;
    to override this and simply return `('',0)` instead, set `raise_errors=False`.
    """
    name = name.strip("'").strip('"').strip()
    symbol = ''.join(char for char in name if char.isalpha())
    digits = ''.join(char for char in name if char.isdigit())
    mass_number = int(digits) if digits else 0
    # The symbol must belong to one of the elements of the periodic table
    is_known = any(atom.symbol == symbol for atom in periodictable.elements)
    if not is_known:
        if raise_errors:
            raise KeyError(f'Unrecognised element: {symbol}')
        return ('', 0)
    # No mass number was given, so there is no isotope to validate
    if mass_number == 0:
        return symbol, mass_number
    isotopes = periodictable.elements.symbol(symbol).isotopes
    if mass_number in isotopes:
        return symbol, mass_number
    if raise_errors:
        raise KeyError(f'Unrecognised isotope: {name}. Allowed mass numbers for {symbol} are: {isotopes}')
    return ('', 0)
def number(text: str, name: str = '') -> float:
def number(
        text:str,
        name:str=''
    ) -> float:
    """Extracts the float value of a given `name` variable from a raw `text`.

    The number may follow `name` directly, or after an optional `:` or `=`
    separator surrounded by any amount of whitespace.
    Scientific notation is accepted with an `e`/`E` exponent,
    as well as with a Fortran-style `d`/`D` exponent (eg. `1.5D-3`).

    Note that `name` is inserted into the regular expression as it is,
    so regex metacharacters inside it keep their special meaning.

    Returns `None` if `text` is None, or if no matching number is found.
    """
    if text is None:
        return None
    pattern = re.compile(rf"{name}\s*[:=]?\s*(-?\d+(?:\.\d+)?(?:[eEdD][+\-]?\d+)?)")
    match = pattern.search(text)
    if not match:
        return None
    # float() does not understand Fortran 'D' exponents, although the pattern
    # accepts them, so they are normalised to 'e' before the conversion.
    value = match.group(1).replace('d', 'e').replace('D', 'e')
    return float(value)

Extracts the float value of a given name variable from a raw text.

def string(text: str, name: str = '', stop: str = '', strip: bool = True) -> str:
 90def string(
 91        text:str,
 92        name:str='',
 93        stop:str='',
 94        strip:bool=True
 95    ) -> str:
 96    """Extracts the `text` value of a given `name` variable from a raw string. Stops before an optional `stop` string.
 97
 98    Removes leading and trailing commas by default, change this with `strip = False`.
 99    """
100    pattern = re.compile(rf"{name}\s*[:=]?\s*(.*)")
101    if stop:
102        pattern = re.compile(rf"{name}\s*[:=]?\s*(.*)(?={stop})")
103    match = re.search(pattern, text)
104    if not match:
105        return None
106    result = str(match.group(1))
107    result = result.strip()
108    if strip:
109        result = result.strip("'")
110        result = result.strip('"')
111        result = result.strip()
112    return result

Extracts the text value of a given name variable from a raw string. Stops before an optional stop string.

Removes leading and trailing quotes (' and ") by default, change this with strip = False.

def column(text: str, column: int = 0) -> str:
def column(
        text:str,
        column:int=0
    ) -> str:
    """Extracts the number found at the desired `column` index of a given `text` (0 by default).

    The `text` is split by whitespace, and the numeric value at the start of
    the selected column is returned as a string, without converting it.
    Negative indices count from the last column.

    Returns `None` if `text` is None, if the `column` index is out of range
    in either direction, or if the selected column does not start with a number.
    """
    if text is None:
        return None
    columns = text.split()
    # Check both ends of the range, so that an out-of-range negative index
    # returns None just like an out-of-range positive one.
    if not -len(columns) <= column < len(columns):
        return None
    match = re.match(r'(-?\d+(?:\.\d+)?(?:[eE][+\-]?\d+)?)', columns[column])
    return match.group(1) if match else None

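Extracts the numeric value found at the desired column index of a given text string (0 by default), returned as a string. Returns None if the column does not exist or does not start with a number.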

def coords(text: str) -> list:
def coords(text:str) -> list:
    """Returns a list with the float coordinates expressed in a given `text` string.

    The `text` is split by commas and whitespace; every resulting piece
    that starts with a number contributes that number to the output list,
    and the rest of the pieces are ignored.
    Returns `None` if `text` is None.
    """
    if text is None:
        return None
    number_at_start = re.compile(r'(-?\d+(?:\.\d+)?(?:[eE][+\-]?\d+)?)')
    pieces = re.split(r'[,\s]+', text.strip())
    hits = (number_at_start.match(piece) for piece in pieces)
    return [float(hit.group(1)) for hit in hits if hit]

Returns a list with the float coordinates expressed in a given text string.

def element(text: str, index: int = 0, raise_errors=True) -> str:
def element(
        text:str,
        index:int=0,
        raise_errors=True,
    ) -> str:
    """Extract a chemical element from a raw `text` string.

    If there are several elements, you can return a specific `index` match (positive, 0 by default).
    If the `index` is greater than the number of elements found, the last one is returned.
    Allows for standard elements (H, He, Na...) and isotopes (H2, He4...).
    An error is raised if no valid element or isotope is found;
    to override this and simply return an empty string instead, set `raise_errors=False`.
    Returns `None` if `text` is None.
    """
    if text is None:
        return None
    columns = re.split(r'[,\s]+', text.strip())
    pattern = re.compile(r'\s*([A-Z][a-z]{0,2}\d{0,3})(?=\s|$)')
    # First pass: keep the pieces that look like an element or isotope name.
    candidates = [hit.group(1) for hit in map(pattern.match, columns) if hit]
    # Second pass: keep only the candidates that are actual elements or isotopes.
    # isotope() returns an empty symbol for unknown names when raise_errors=False,
    # so no exception has to be caught, and unrelated errors are never hidden.
    found_elements = []
    for candidate in candidates:
        symbol, _ = isotope(candidate, raise_errors=False)
        if symbol:
            found_elements.append(candidate)
    if not found_elements:
        if raise_errors:
            raise ValueError(f'No valid element nor isotope found in the string:\n{text}')
        return ''
    # An index past the end falls back to the last element found.
    if len(found_elements) <= index:
        return found_elements[-1]
    return found_elements[index]

Extract a chemical element from a raw text string.

If there are several elements, you can return a specific index match (positive, 0 by default). Allows for standard elements (H, He, Na...) and isotopes (H2, He4...). An error is raised if no valid element or isotope is found; to override this and simply return an empty string instead, set raise_errors=False.

def isotope(name: str, raise_errors=True) -> tuple:
def isotope(name:str, raise_errors=True) -> tuple:
    """Split the `name` of an isotope into the element and the mass number, eg. 'He4' -> ('He', 4).

    Surrounding quotes and whitespace are removed from `name` first;
    then its letters form the element symbol, and its digits form the mass number.
    The mass number will be 0 if only the element name is provided, eg. 'He' -> ('He', 0).
    If the element or isotope does not exist, a KeyError is raised;
    to override this and simply return `('',0)` instead, set `raise_errors=False`.
    """
    name = name.strip("'").strip('"').strip()
    symbol = ''.join(char for char in name if char.isalpha())
    digits = ''.join(char for char in name if char.isdigit())
    mass_number = int(digits) if digits else 0
    # The symbol must belong to one of the elements of the periodic table
    is_known = any(atom.symbol == symbol for atom in periodictable.elements)
    if not is_known:
        if raise_errors:
            raise KeyError(f'Unrecognised element: {symbol}')
        return ('', 0)
    # No mass number was given, so there is no isotope to validate
    if mass_number == 0:
        return symbol, mass_number
    isotopes = periodictable.elements.symbol(symbol).isotopes
    if mass_number in isotopes:
        return symbol, mass_number
    if raise_errors:
        raise KeyError(f'Unrecognised isotope: {name}. Allowed mass numbers for {symbol} are: {isotopes}')
    return ('', 0)

Split the name of an isotope into the element and the mass number, eg. 'He4' -> ('He', 4).

The isotope will be 0 if only the element name is provided, eg. 'He' -> ('He', 0). If the element or isotope does not exist, it raises an error; to override this and simply return ('',0) instead, set raise_errors=False.