aton.txt.extract
Description
Functions to extract specific data from raw text strings, simplifying the use of regular expressions.
Index
number()
string()
column()
coords()
element()
isotope()
Examples
To extract a float value from a string,
from aton import txt
line = 'energy = 500.0 Ry'
txt.extract.number(line, 'energy')
# 500.0 (float output)
To extract a text value, after and before specific strings,
line = 'energy = 500.0 Ry were calculated'
txt.extract.string(line, 'energy', 'were')
# '500.0 Ry' (String output)
To extract a value from a specific column,
# Name, Energy, Force, Error
line = 'Testing 1.1 1.2 0.3'
energy = txt.extract.column(line, 1)
# '1.1' (String output)
To extract coordinates,
line = ' He 0.10 0.20 0.30 '
txt.extract.coords(line)
# [0.1, 0.2, 0.3] (List of floats)
To extract chemical elements,
line = ' He4 0.10 Ag 0.20 Pb 0.30 '
first_element = txt.extract.element(line, 0)
# 'He4'
third_element = txt.extract.element(line, 2)
# 'Pb'
To split an isotope name into its element and mass number,
txt.extract.isotope('He4')
# ('He', 4) (symbol, mass number)
txt.extract.isotope('Au')
# ('Au', 0) (No mass number)
"""
# Description

Functions to extract specific data from raw text strings,
simplifying the use of regular expressions.


# Index

`number()`  
`string()`  
`column()`  
`coords()`  
`element()`  
`isotope()`  


# Examples

To extract a float value from a string,
```python
from aton import txt
line = 'energy =   500.0 Ry'
txt.extract.number(line, 'energy')
# 500.0  (float output)
```

To extract a text value, after and before specific strings,
```python
line = 'energy =   500.0 Ry were calculated'
txt.extract.string(line, 'energy', 'were')
# '500.0 Ry'  (String output)
```

To extract a value from a specific column,
```python
# Name, Energy, Force, Error
line = 'Testing    1.1    1.2    0.3'
energy = txt.extract.column(line, 1)
# '1.1'  (String output)
```

To extract coordinates,
```python
line = ' He  0.10  0.20  0.30 '
txt.extract.coords(line)
# [0.1, 0.2, 0.3]  (List of floats)
```

To extract chemical elements,
```python
line = ' He4  0.10  Ag  0.20  Pb  0.30 '
first_element = txt.extract.element(line, 0)
# 'He4'
third_element = txt.extract.element(line, 2)
# 'Pb'
```

To split an isotope name into its element and mass number,
```python
txt.extract.isotope('He4')
# ('He', 4)  (symbol, mass number)
txt.extract.isotope('Au')
# ('Au', 0)  (No mass number)
```

---
"""


import re
import periodictable


def number(
        text:str,
        name:str=''
    ) -> float:
    """Extracts the float value of a given `name` variable from a raw `text`.

    The number may be preceded by an optional `:` or `=` separator,
    and may use scientific notation with `e`/`E` or Fortran-style `d`/`D`
    exponent markers (eg. `1.5D+02`).
    Note that `name` is inserted verbatim into a regular expression.

    Returns `None` if `text` is `None` or if no value is found.
    """
    if text is None:
        return None
    pattern = re.compile(rf"{name}\s*[:=]?\s*(-?\d+(?:\.\d+)?(?:[eEdD][+\-]?\d+)?)")
    match = pattern.search(text)
    if not match:
        return None
    # float() does not understand Fortran 'D' exponents, so convert them first
    value = match.group(1).replace('d', 'e').replace('D', 'E')
    return float(value)


def string(
        text:str,
        name:str='',
        stop:str='',
        strip:bool=True
    ) -> str:
    """Extracts the `text` value of a given `name` variable from a raw string. Stops before an optional `stop` string.

    An optional `:` or `=` separator after `name` is skipped.
    The match is greedy, so the value runs up to the last occurrence of `stop` on the line.
    Note that `name` and `stop` are inserted verbatim into a regular expression.

    Surrounding whitespace is always removed.
    Removes leading and trailing quotes by default, change this with `strip = False`.

    Returns `None` if `text` is `None` or if no match is found.
    """
    if text is None:
        return None
    pattern = re.compile(rf"{name}\s*[:=]?\s*(.*)")
    if stop:
        pattern = re.compile(rf"{name}\s*[:=]?\s*(.*)(?={stop})")
    match = pattern.search(text)
    if not match:
        return None
    result = str(match.group(1)).strip()
    if strip:
        result = result.strip("'")
        result = result.strip('"')
        result = result.strip()
    return result


def column(
        text:str,
        column:int=0
    ) -> str:
    """Extracts the number in the desired `column` index of a given `text` (0 by default).

    Columns are separated by whitespace. Only the number found
    at the start of the column is returned, as a string.

    Returns `None` if `text` is `None`, if the `column` index is out of range,
    or if the column does not start with a number.
    """
    if text is None:
        return None
    columns = text.split()
    pattern = r'(-?\d+(?:\.\d+)?(?:[eE][+\-]?\d+)?)'
    # Out-of-range indices return None in both directions, instead of raising
    if not -len(columns) <= column < len(columns):
        return None
    match = re.match(pattern, columns[column])
    if match:
        return match.group(1)
    return None


def coords(text:str) -> list:
    """Returns a list with the float coordinates expressed in a given `text` string.

    The text is split by commas and whitespace; every piece that starts
    with a number contributes that number to the list.
    Returns `None` if `text` is `None`.
    """
    if text is None:
        return None
    columns = re.split(r'[,\s]+', text.strip())
    pattern = re.compile(r'(-?\d+(?:\.\d+)?(?:[eE][+\-]?\d+)?)')
    matches = (pattern.match(column) for column in columns)
    return [float(match.group(1)) for match in matches if match]


def element(
        text:str,
        index:int=0,
        raise_errors=True,
    ) -> str:
    """Extract a chemical element from a raw `text` string.

    If there are several elements, you can return a specific `index` match (positive, 0 by default).
    If `index` is greater than the number of elements found, the last one is returned.
    Allows for standard elements (H, He, Na...) and isotopes (H2, He4...).
    An error is raised if no valid element or isotope is found;
    to override this and simply return an empty string instead, set `raise_errors=False`.

    Returns `None` if `text` is `None`.
    """
    if text is None:
        return None
    columns = re.split(r'[,\s]+', text.strip())
    pattern = re.compile(r'\s*([A-Z][a-z]{0,2}\d{0,3})(?=\s|$)')
    found_elements = []
    for column in columns:
        match = pattern.match(column)
        if not match:
            continue
        candidate = match.group(1)
        # isotope() returns an empty symbol for invalid atoms when raise_errors=False
        symbol, _ = isotope(candidate, raise_errors=False)
        if symbol:
            found_elements.append(candidate)
    if not found_elements:
        if raise_errors:
            raise ValueError(f'No valid element nor isotope found in the string:\n{text}')
        return ''
    if len(found_elements) <= index:
        return found_elements[-1]
    return found_elements[index]


def isotope(name:str, raise_errors=True) -> tuple:
    """Split the `name` of an isotope into the element and the mass number, eg. 'He4' -> ('He', 4).

    The isotope will be 0 if only the element name is provided, eg. 'He' -> ('He', 0).
    If the element or isotope does not exist, it raises an error;
    to override this and simply return `('',0)` instead, set `raise_errors=False`.
    """
    name = name.strip("'").strip('"').strip()
    symbol = ''.join(filter(str.isalpha, name))
    digits = ''.join(filter(str.isdigit, name))
    mass_number = int(digits) if digits else 0
    # Check that the element exists
    if symbol not in [e.symbol for e in periodictable.elements]:
        if not raise_errors:
            return ('', 0)
        raise KeyError(f'Unrecognised element: {symbol}')
    # Check that the isotope exists, if a mass number was given
    if mass_number != 0:
        isotopes = periodictable.elements.symbol(symbol).isotopes
        if mass_number not in isotopes:
            if not raise_errors:
                return ('', 0)
            raise KeyError(f'Unrecognised isotope: {name}. Allowed mass numbers for {symbol} are: {isotopes}')
    return symbol, mass_number
def number(
        text:str,
        name:str=''
    ) -> float:
    """Extracts the float value of a given `name` variable from a raw `text`.

    The number may be preceded by an optional `:` or `=` separator,
    and may use scientific notation with `e`/`E` or Fortran-style `d`/`D`
    exponent markers (eg. `1.5D+02`).
    Note that `name` is inserted verbatim into a regular expression.

    Returns `None` if `text` is `None` or if no value is found.
    """
    if text is None:
        return None
    pattern = re.compile(rf"{name}\s*[:=]?\s*(-?\d+(?:\.\d+)?(?:[eEdD][+\-]?\d+)?)")
    match = pattern.search(text)
    if not match:
        return None
    # float() does not understand Fortran 'D' exponents, so convert them first
    value = match.group(1).replace('d', 'e').replace('D', 'E')
    return float(value)
Extracts the float value of a given name variable from a raw text.
def string(
        text:str,
        name:str='',
        stop:str='',
        strip:bool=True
    ) -> str:
    """Extracts the `text` value of a given `name` variable from a raw string. Stops before an optional `stop` string.

    An optional `:` or `=` separator after `name` is skipped.
    The match is greedy, so the value runs up to the last occurrence of `stop` on the line.
    Note that `name` and `stop` are inserted verbatim into a regular expression.

    Surrounding whitespace is always removed.
    Removes leading and trailing quotes by default, change this with `strip = False`.

    Returns `None` if `text` is `None` or if no match is found.
    """
    if text is None:
        return None
    pattern = re.compile(rf"{name}\s*[:=]?\s*(.*)")
    if stop:
        pattern = re.compile(rf"{name}\s*[:=]?\s*(.*)(?={stop})")
    match = pattern.search(text)
    if not match:
        return None
    result = str(match.group(1)).strip()
    if strip:
        result = result.strip("'")
        result = result.strip('"')
        result = result.strip()
    return result
Extracts the text value of a given name variable from a raw string. Stops before an optional stop string.
Removes leading and trailing quotes by default, change this with strip = False.
def column(
        text:str,
        column:int=0
    ) -> str:
    """Extracts the number in the desired `column` index of a given `text` (0 by default).

    Columns are separated by whitespace. Only the number found
    at the start of the column is returned, as a string.

    Returns `None` if `text` is `None`, if the `column` index is out of range,
    or if the column does not start with a number.
    """
    if text is None:
        return None
    columns = text.split()
    pattern = r'(-?\d+(?:\.\d+)?(?:[eE][+\-]?\d+)?)'
    # Out-of-range indices return None in both directions, instead of raising
    if not -len(columns) <= column < len(columns):
        return None
    match = re.match(pattern, columns[column])
    if match:
        return match.group(1)
    return None
def coords(text:str) -> list:
    """Returns a list with the float coordinates expressed in a given `text` string.

    The text is split by commas and whitespace; every piece that starts
    with a number contributes that number to the list.
    Returns `None` if `text` is `None`.
    """
    if text is None:
        return None
    leading_number = re.compile(r'(-?\d+(?:\.\d+)?(?:[eE][+\-]?\d+)?)')
    pieces = re.split(r'[,\s]+', text.strip())
    hits = (leading_number.match(piece) for piece in pieces)
    return [float(hit.group(1)) for hit in hits if hit]
Returns a list with the float coordinates expressed in a given text string.
def element(
        text:str,
        index:int=0,
        raise_errors=True,
    ) -> str:
    """Extract a chemical element from a raw `text` string.

    If there are several elements, you can return a specific `index` match (positive, 0 by default).
    If `index` is greater than the number of elements found, the last one is returned.
    Allows for standard elements (H, He, Na...) and isotopes (H2, He4...).
    An error is raised if no valid element or isotope is found;
    to override this and simply return an empty string instead, set `raise_errors=False`.

    Returns `None` if `text` is `None`.
    """
    if text is None:
        return None
    columns = re.split(r'[,\s]+', text.strip())
    pattern = re.compile(r'\s*([A-Z][a-z]{0,2}\d{0,3})(?=\s|$)')
    found_elements = []
    for column in columns:
        match = pattern.match(column)
        if not match:
            continue
        candidate = match.group(1)
        # isotope() returns an empty symbol for invalid atoms when raise_errors=False
        symbol, _ = isotope(candidate, raise_errors=False)
        if symbol:
            found_elements.append(candidate)
    if not found_elements:
        if raise_errors:
            raise ValueError(f'No valid element nor isotope found in the string:\n{text}')
        return ''
    if len(found_elements) <= index:
        return found_elements[-1]
    return found_elements[index]
Extract a chemical element from a raw text string.
If there are several elements, you can return a specific index match (positive, 0 by default).
Allows for standard elements (H, He, Na...) and isotopes (H2, He4...).
An error is raised if no valid element or isotope is found;
to override this and simply return an empty string instead, set raise_errors=False.
def isotope(name:str, raise_errors=True) -> tuple:
    """Split the `name` of an isotope into the element and the mass number, eg. 'He4' -> ('He', 4).

    The isotope will be 0 if only the element name is provided, eg. 'He' -> ('He', 0).
    If the element or isotope does not exist, it raises an error;
    to override this and simply return `('',0)` instead, set `raise_errors=False`.
    """
    name = name.strip("'").strip('"').strip()
    symbol = ''.join(filter(str.isalpha, name))
    digits = ''.join(filter(str.isdigit, name))
    mass_number = int(digits) if digits else 0
    # Check that the element exists
    if symbol not in [e.symbol for e in periodictable.elements]:
        if raise_errors:
            raise KeyError(f'Unrecognised element: {symbol}')
        return ('', 0)
    # Only elements with an explicit mass number need the isotope check
    if mass_number == 0:
        return symbol, mass_number
    isotopes = periodictable.elements.symbol(symbol).isotopes
    if mass_number not in isotopes:
        if raise_errors:
            raise KeyError(f'Unrecognised isotope: {name}. Allowed mass numbers for {symbol} are: {isotopes}')
        return ('', 0)
    return symbol, mass_number
Split the name of an isotope into the element and the mass number, eg. 'He4' -> ('He', 4).
The isotope will be 0 if only the element name is provided, eg. 'He' -> ('He', 0).
If the element or isotope does not exist, it raises an error;
to override this and simply return ('',0) instead, set raise_errors=False.