aton.txt.find
Description
Functions to search for specific content inside text files.
Index
Find and return specific text strings from a file
lines()
between()
Find the (start, end) position of specific strings in a file
pos()
pos_regex()
next_pos()
next_pos_regex()
line_pos()
between_pos()
Examples
To find the lines containing the word 'key', plus an additional line below,
from aton import txt
# with split = False
txt.find.lines(filepath=file, key='key', additional=1)
# ['line with key 1\nline below first match',
# 'line with key 2\nline below second match']
# with split = True
txt.find.lines(filepath=file, key='key', additional=1, split=True)
# ['line with key 1',
# 'line below first match',
# 'line with key 2',
# 'line below second match']
To find the text between the lines containing the words 'first' and 'second',
from aton import txt
txt.find.between(filepath=file, key1='first', key2='second')
# 'first line\nadditional\nlines\nin\nbetween\nsecond line'
"""
# Description

Functions to search for specific content inside text files.


# Index

Find and return specific text strings from a file  
`lines()`  
`between()`  

Find the `(start, end)` position of specific strings in a file  
`pos()`  
`pos_regex()`  
`next_pos()`  
`next_pos_regex()`  
`line_pos()`  
`between_pos()`  


# Examples

To find the lines containing the word 'key', plus an additional line below,
```python
from aton import txt
# with split = False
txt.find.lines(filepath=file, key='key', additional=1)
    # ['line with key 1\\nline below first match',
    #  'line with key 2\\nline below second match']
# with split = True
txt.find.lines(filepath=file, key='key', additional=1, split=True)
    # ['line with key 1',
    #  'line below first match',
    #  'line with key 2',
    #  'line below second match']
```

To find the text between the lines containing the words 'first' and 'second',
```python
from aton import txt
txt.find.between(filepath=file, key1='first', key2='second')
    # 'first line\\nadditional\\nlines\\nin\\nbetween\\nsecond line'
```

---
"""


import mmap
import re
import aton.file as file


def lines(
        filepath:str,
        key:str,
        matches:int=0,
        additional:int=0,
        split: bool=False,
        regex:bool=False
    ) -> list:
    """Returns a list with the lines containing the `key` string in `filepath`.

    If no match is found, or the file is empty, returns an empty list.
    A line containing the `key` several times is returned once per occurrence.

    To use regular expressions in the search, set `regex=True`
    (deactivated by default).

    The value `matches` specifies the max number of matches to be returned.
    Defaults to 0 to return all possible matches. Set it to 1 to return only one match,
    or to negative integers to start the search from the end of the file upwards.

    The value `additional` specifies the number of additional lines
    below the target line that are also returned;
    2 to return the found line plus two additional lines below, etc.
    Negative values return the specified number of lines before the target line.
    The original ordering from the file is preserved.
    Defaults to `additional=0`, only returning the target line.
    By default, the additional lines are returned
    in the same list item as the match separated by a `\\n`,
    unless `split=True`, in which case these additional lines
    are split and added as additional items in the list.
    Returned items never include the newline that ends their last line.
    """
    file_path = file.get(filepath)
    # Read-only access is enough; 'rb' avoids requiring write permission
    with open(file_path, 'rb') as f:
        try:
            mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
        except ValueError:  # Empty files cannot be memory mapped
            return []
    matches_found = []
    with mm:  # Release the memory map once the lines are extracted
        if regex:
            # NOTE(review): pos_regex() reports offsets in the decoded text; they equal
            # the byte offsets used below only for ASCII content with '\n' line endings.
            positions = pos_regex(file_path, key, matches)
        else:
            positions = pos(mm, key, matches)
        size = len(mm)
        for start, end in positions:
            # Boundaries of the full line containing the match
            match_start = mm.rfind(b'\n', 0, start) + 1  # 0 when there is no previous newline
            match_end = mm.find(b'\n', end)
            if match_end == -1:
                match_end = size
            if additional > 0:
                for _ in range(additional):
                    if match_end + 1 >= size:
                        break  # Already at the last line of the file
                    next_end = mm.find(b'\n', match_end + 1)
                    if next_end == -1:
                        match_end = size  # Last line has no trailing newline
                        break
                    match_end = next_end
            elif additional < 0:
                for _ in range(-additional):
                    if match_start == 0:
                        # First line reached. A negative end index in rfind()
                        # would be read as an offset from the end of the file.
                        break
                    match_start = mm.rfind(b'\n', 0, match_start - 1) + 1
            matches_found.append(mm[match_start:match_end].decode())
    if split:
        matches_found = [piece for block in matches_found for piece in block.splitlines()]
    return matches_found


def between(
        filepath:str,
        key1:str,
        key2:str,
        include_keys:bool=True,
        match:int=1,
        regex:bool=False
    ) -> str:
    """Returns the content between the lines with `key1` and `key2` in `filepath`.

    Keywords can be at any position within the line.
    Regular expressions can be used by setting `regex=True`.

    Key lines are included by default; set `include_keys=False` to omit them.

    If there is more than one match, only the first one is considered by default;
    set `match` (int) to specify a particular match (1, 2... 0 is considered as 1!).
    Use negative numbers to start from the end of the file.

    If no match is found, or the file is empty, returns an empty string.

    If `key2` is not found, it returns all the text from `key1` to the end of the file.
    """
    file_path = file.get(filepath)
    start, end = between_pos(file_path, key1, key2, include_keys, match, regex)
    if (start, end) == (-1, -1):
        return ''
    # Read-only access is enough; 'rb' avoids requiring write permission
    with open(file_path, 'rb') as f:
        try:
            mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
        except ValueError:  # Empty files cannot be memory mapped
            return ''
    with mm:  # Close the memory map after copying the slice
        return mm[start:end].decode()


def pos(
        filepath,
        key:str,
        matches:int=0,
    ) -> list:
    """Returns a list with the `(start, end)` byte positions of the `key` in `filepath`.

    If no match is found, returns an empty list.
    An empty file or an empty `key` also return an empty list.

    The `filepath` can be a file or a memory mapped file.
    A memory map provided by the caller is left open.

    The value `matches` specifies the max number of matches to return.
    Defaults to 0 to return all possible matches.
    Set it to 1 to return only one match,
    2 to get the first two matches, etc.
    You can also set it to negative integers to start
    searching from the end of the file upwards.
    Matches do not overlap and are always listed in file order.

    This method is faster than `pos_regex()`,
    but does not search for regular expressions.
    """
    keyword_bytes = key.encode()
    mm = filepath
    opened_here = not isinstance(filepath, mmap.mmap)
    if opened_here:
        file_path = file.get(filepath)
        with open(file_path, 'rb') as f:
            try:
                mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
            except ValueError:  # Empty files cannot be memory mapped: nothing to find
                return []
    positions = []
    try:
        if not keyword_bytes:
            # An empty key matches everywhere without advancing the cursor,
            # which would never terminate when all matches are requested.
            return positions
        key_size = len(keyword_bytes)
        if matches >= 0:
            cursor = 0
            while matches == 0 or len(positions) < matches:
                found = mm.find(keyword_bytes, cursor)
                if found == -1:
                    break
                cursor = found + key_size
                positions.append((found, cursor))
        else:
            limit = len(mm)
            while len(positions) < -matches:
                found = mm.rfind(keyword_bytes, 0, limit)
                if found == -1:
                    break
                positions.append((found, found + key_size))
                limit = found
            positions.reverse()  # Collected backwards; restore file order
    finally:
        if opened_here:
            mm.close()
    return positions


def pos_regex(
        filepath,
        key:str,
        matches:int=0
    ) -> list:
    """Returns a list of the `(start, end)` positions of a `key` in a given `filepath` (actual file, not mmapped!).

    If no match is found, returns an empty list.

    The value `matches` specifies the max number of matches to return.
    Defaults to 0 to return all possible matches. Set it to 1 to return only one match,
    or to negative integers to start searching from the end of the file upwards.
    Matches are always listed in file order.

    The file is read as UTF-8 text and searched with `re.MULTILINE`,
    so `^` and `$` match at every line.

    For big files, this method is slower than `pos()`, but it can search for regular expressions.
    """
    file_path = file.get(filepath)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # NOTE(review): offsets index the decoded text; they equal the byte offsets
    # returned by pos() only for ASCII content with '\n' line endings.
    pattern = re.compile(key, flags=re.MULTILINE)  # MULTILINE for ^ regex
    if matches > 0:
        # Iterate over the whole text instead of searching successive slices,
        # so that anchors and look-behinds see the real context of each match.
        positions = []
        for found in pattern.finditer(content):
            positions.append(found.span())
            if len(positions) == matches:
                break
        return positions
    positions = [found.span() for found in pattern.finditer(content)]
    if matches < 0:
        positions = positions[matches:]  # Keep only the last |matches| items
    return positions


def next_pos(
        filepath,
        position:tuple,
        key:str,
        match:int=1
    ) -> tuple:
    """Get the next position of the `key` in the `filepath` (file or mmapped file), starting from an initial `position` tuple.

    The `match` number specifies the nonzero index of the next match to return (1, 2... 0 is considered as 1!).
    It can be negative to search backwards from the initial position.
    If there are fewer matches than requested, the last one found is returned.
    Returns `(-1, -1)` if nothing is found, if the file is empty,
    or if the initial `position` is itself `(-1, -1)`.

    A memory map provided by the caller is left open.

    This method is specific for normal strings.
    To use regular expressions, check `next_pos_regex()`.
    """
    mm = filepath
    opened_here = not isinstance(filepath, mmap.mmap)
    if opened_here:
        file_path = file.get(filepath)
        with open(file_path, 'rb') as f:
            try:
                mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
            except ValueError:  # Empty files cannot be memory mapped
                return (-1, -1)
    try:
        if position == (-1, -1):
            # 'Not found' sentinel: using it as an index would search from the end of the file
            return (-1, -1)
        start, end = position
        keyword_bytes = key.encode()
        key_size = len(keyword_bytes)
        found = (-1, -1)
        if match >= 0:
            for _ in range(match or 1):
                start = mm.find(keyword_bytes, end)
                if start == -1:
                    break
                end = start + key_size
                found = (start, end)
        else:
            for _ in range(-match):
                start = mm.rfind(keyword_bytes, 0, start)
                if start == -1:
                    break
                found = (start, start + key_size)
        return found
    finally:
        if opened_here:
            mm.close()


def next_pos_regex(
        filepath,
        position:tuple,
        key:str,
        match:int=0
    ) -> tuple:
    """Get the next position of the `key` in the `filepath` (actual file, not mmapped!), starting from an initial `position` tuple.

    The `match` number specifies the next match to return (1, 2... 0 is considered as 1!).
    It can be negative to search backwards from the initial position;
    only matches ending at or before the start of `position` are then considered.
    If there are fewer matches than requested, the last one found is returned.
    Returns `(-1, -1)` if nothing is found or if `position` is itself `(-1, -1)`.

    The file is read as UTF-8 text and searched with `re.MULTILINE`, as in `pos_regex()`.
    This method is specific for regular expressions.

    For normal strings, check the faster `next_pos()` method.
    """
    file_path = file.get(filepath)
    if position == (-1, -1):
        # 'Not found' sentinel: using it as an index would search from the end of the text
        return (-1, -1)
    start, end = position
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # NOTE(review): offsets index the decoded text, not the raw bytes of the file.
    pattern = re.compile(key, flags=re.MULTILINE)
    if match >= 0:
        found = (-1, -1)
        for _ in range(match or 1):
            # Search from `end` on the full text so anchors keep their real context
            hit = pattern.search(content, end)
            if hit is None:
                break
            found = hit.span()
            end = hit.end()
        return found
    # Reverse match: keep only the matches located before the initial position
    candidates = [hit.span() for hit in pattern.finditer(content) if hit.end() <= start]
    if not candidates:
        return (-1, -1)
    # `match` is already negative; clamp it to the earliest available match
    return candidates[max(match, -len(candidates))]


def line_pos(
        filepath,
        position:tuple,
        skips:int=0
    ) -> tuple:
    """Get the position of the full line containing the `position` tuple in `filepath` (whether file or memory mapped file).

    The returned `(start, end)` excludes the newline that ends the line.

    A specific line below can be returned with `skips` being a natural int,
    or previous lines with negative values.
    Skipping past the last line returns an empty position at the end of the file,
    and skipping before the first line returns `(0, 0)`.

    Returns `(-1, -1)` if `position` is `(-1, -1)` or if the file is empty.
    A memory map provided by the caller is left open.
    """
    mm = filepath
    opened_here = not isinstance(filepath, mmap.mmap)
    if opened_here:
        file_path = file.get(filepath)
        with open(file_path, 'rb') as f:
            try:
                mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
            except ValueError:  # Empty files cannot be memory mapped
                return (-1, -1)
    try:
        if position == (-1, -1):  # No match
            return (-1, -1)
        size = len(mm)
        start, end = position
        if skips == 0:
            start = mm.rfind(b'\n', 0, start) + 1  # 0 when there is no previous newline
            end = mm.find(b'\n', end)
            if end == -1:  # Last line without trailing newline
                end = size
        elif skips > 0:
            for _ in range(skips):
                newline = mm.find(b'\n', end)
                if newline == -1:  # No more lines below
                    start = end = size
                    break
                start = newline + 1
                end = mm.find(b'\n', start)
                if end == -1:  # Last line without trailing newline
                    end = size
                    break
        else:  # previous lines
            for _ in range(-skips):
                end = mm.rfind(b'\n', 0, start)
                if end == -1:  # No more lines above
                    start = end = 0
                    break
                start = mm.rfind(b'\n', 0, end) + 1
        return start, end
    finally:
        if opened_here:
            mm.close()


def between_pos(
        filepath,
        key1:str,
        key2:str,
        include_keys:bool=True,
        match:int=1,
        regex:bool=False
    ) -> tuple:
    """Returns the positions of the content between the lines containing `key1` and `key2` in the `filepath`.

    Keywords can be at any position within the line.
    Regular expressions can be used by setting `regex=True`.

    Key lines are included by default; set `include_keys=False` to omit them.

    If there is more than one match, only the first one is considered by default;
    set `match` number to specify a particular match (1, 2... 0 is considered as 1!).
    Use negative numbers to start from the end of the file.

    If `key1` is not found, or the file is empty, returns `(-1, -1)`.
    If `key2` is not found, it returns the text position from `key1` to the end of the file.
    """
    file_path = file.get(filepath)
    if match == 0:
        match = 1
    # Read-only access is enough; 'rb' avoids requiring write permission
    with open(file_path, 'rb') as f:
        try:
            mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
        except ValueError:  # Empty files cannot be memory mapped
            return (-1, -1)
    with mm:  # A single memory map is shared by all the string searches
        if regex:
            positions_1 = pos_regex(file_path, key1, match)
        else:
            positions_1 = pos(mm, key1, match)
        if not positions_1:
            return (-1, -1)
        # Positions come in file order: the requested match is the last item
        # when counting from the top, and the first one when counting from the end.
        position_1 = positions_1[-1] if match > 0 else positions_1[0]
        if regex:
            position_2 = next_pos_regex(file_path, position_1, key2, 1)
        else:
            position_2 = next_pos(mm, position_1, key2, 1)
        # Without the key lines: start one line below key1, end one line above key2
        skip_line_1, skip_line_2 = (0, 0) if include_keys else (1, -1)
        start, _ = line_pos(mm, position_1, skip_line_1)
        if position_2 != (-1, -1):
            _, end = line_pos(mm, position_2, skip_line_2)
        else:
            end = len(mm)
    return start, end
def lines(
        filepath:str,
        key:str,
        matches:int=0,
        additional:int=0,
        split: bool=False,
        regex:bool=False
    ) -> list:
    """Returns a list with the lines containing the `key` string in `filepath`.

    If no match is found, or the file is empty, returns an empty list.
    A line containing the `key` several times is returned once per occurrence.

    To use regular expressions in the search, set `regex=True`
    (deactivated by default).

    The value `matches` specifies the max number of matches to be returned.
    Defaults to 0 to return all possible matches. Set it to 1 to return only one match,
    or to negative integers to start the search from the end of the file upwards.

    The value `additional` specifies the number of additional lines
    below the target line that are also returned;
    2 to return the found line plus two additional lines below, etc.
    Negative values return the specified number of lines before the target line.
    The original ordering from the file is preserved.
    Defaults to `additional=0`, only returning the target line.
    By default, the additional lines are returned
    in the same list item as the match separated by a `\\n`,
    unless `split=True`, in which case these additional lines
    are split and added as additional items in the list.
    Returned items never include the newline that ends their last line.
    """
    file_path = file.get(filepath)
    # Read-only access is enough; 'rb' avoids requiring write permission
    with open(file_path, 'rb') as f:
        try:
            mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
        except ValueError:  # Empty files cannot be memory mapped
            return []
    matches_found = []
    with mm:  # Release the memory map once the lines are extracted
        if regex:
            # NOTE(review): pos_regex() reports offsets in the decoded text; they equal
            # the byte offsets used below only for ASCII content with '\n' line endings.
            positions = pos_regex(file_path, key, matches)
        else:
            positions = pos(mm, key, matches)
        size = len(mm)
        for start, end in positions:
            # Boundaries of the full line containing the match
            match_start = mm.rfind(b'\n', 0, start) + 1  # 0 when there is no previous newline
            match_end = mm.find(b'\n', end)
            if match_end == -1:
                match_end = size
            if additional > 0:
                for _ in range(additional):
                    if match_end + 1 >= size:
                        break  # Already at the last line of the file
                    next_end = mm.find(b'\n', match_end + 1)
                    if next_end == -1:
                        match_end = size  # Last line has no trailing newline
                        break
                    match_end = next_end
            elif additional < 0:
                for _ in range(-additional):
                    if match_start == 0:
                        # First line reached. A negative end index in rfind()
                        # would be read as an offset from the end of the file.
                        break
                    match_start = mm.rfind(b'\n', 0, match_start - 1) + 1
            matches_found.append(mm[match_start:match_end].decode())
    if split:
        matches_found = [piece for block in matches_found for piece in block.splitlines()]
    return matches_found
Returns a list with the matches containing the key string in filepath.
If no match is found, returns an empty list.
To use regular expressions in the search, set regex=True
(deactivated by default).
The value matches specifies the max number of matches to be returned.
Defaults to 0 to return all possible matches. Set it to 1 to return only one match,
or to negative integers to start the search from the end of the file upwards.
The value additional specifies the number of additional lines
below the target line that are also returned;
2 to return the found line plus two additional lines below, etc.
Negative values return the specified number of lines before the target line.
The original ordering from the file is preserved.
Defaults to additional=0, only returning the target line.
By default, the additional lines are returned
in the same list item as the match separated by a \n,
unless split=True, in which case these additional lines
are split and added as additional items in the list.
def between(
        filepath:str,
        key1:str,
        key2:str,
        include_keys:bool=True,
        match:int=1,
        regex:bool=False
    ) -> str:
    """Returns the content between the lines with `key1` and `key2` in `filepath`.

    Keywords can be at any position within the line.
    Regular expressions can be used by setting `regex=True`.

    Key lines are included by default; set `include_keys=False` to omit them.

    If there is more than one match, only the first one is considered by default;
    set `match` (int) to specify a particular match (1, 2... 0 is considered as 1!).
    Use negative numbers to start from the end of the file.

    If no match is found, or the file is empty, returns an empty string.

    If `key2` is not found, it returns all the text from `key1` to the end of the file.
    """
    file_path = file.get(filepath)
    start, end = between_pos(file_path, key1, key2, include_keys, match, regex)
    if (start, end) == (-1, -1):
        return ''
    # Read-only access is enough; 'rb' avoids requiring write permission
    with open(file_path, 'rb') as f:
        try:
            mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
        except ValueError:  # Empty files cannot be memory mapped
            return ''
    with mm:  # Close the memory map after copying the slice
        return mm[start:end].decode()
Returns the content between the lines with key1 and key2 in filepath.
Keywords can be at any position within the line.
Regular expressions can be used by setting regex=True.
Key lines are included by default (include_keys=True); set include_keys=False to omit them.
If there is more than one match, only the first one is considered by default;
set match (int) to specify a particular match (1, 2... 0 is considered as 1!).
Use negative numbers to start from the end of the file.
If no match is found, returns an empty string.
If key2 is not found, it returns all the text from key1 to the end of the file.
def pos(
        filepath,
        key:str,
        matches:int=0,
    ) -> list:
    """Returns a list with the `(start, end)` byte positions of the `key` in `filepath`.

    If no match is found, returns an empty list.
    An empty file or an empty `key` also return an empty list.

    The `filepath` can be a file or a memory mapped file.
    A memory map provided by the caller is left open.

    The value `matches` specifies the max number of matches to return.
    Defaults to 0 to return all possible matches.
    Set it to 1 to return only one match,
    2 to get the first two matches, etc.
    You can also set it to negative integers to start
    searching from the end of the file upwards.
    Matches do not overlap and are always listed in file order.

    This method is faster than `pos_regex()`,
    but does not search for regular expressions.
    """
    keyword_bytes = key.encode()
    mm = filepath
    opened_here = not isinstance(filepath, mmap.mmap)
    if opened_here:
        file_path = file.get(filepath)
        with open(file_path, 'rb') as f:
            try:
                mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
            except ValueError:  # Empty files cannot be memory mapped: nothing to find
                return []
    positions = []
    try:
        if not keyword_bytes:
            # An empty key matches everywhere without advancing the cursor,
            # which would never terminate when all matches are requested.
            return positions
        key_size = len(keyword_bytes)
        if matches >= 0:
            cursor = 0
            while matches == 0 or len(positions) < matches:
                found = mm.find(keyword_bytes, cursor)
                if found == -1:
                    break
                cursor = found + key_size
                positions.append((found, cursor))
        else:
            limit = len(mm)
            while len(positions) < -matches:
                found = mm.rfind(keyword_bytes, 0, limit)
                if found == -1:
                    break
                positions.append((found, found + key_size))
                limit = found
            positions.reverse()  # Collected backwards; restore file order
    finally:
        if opened_here:
            mm.close()
    return positions
Returns a list with the positions of the key in filepath.
If no match is found, returns an empty list.
The filepath can be a file or a memory mapped file.
The value matches specifies the max number of matches to return.
Defaults to 0 to return all possible matches.
Set it to 1 to return only one match,
2 to get the first two matches, etc.
You can also set it to negative integers to start
searching from the end of the file upwards.
This method is faster than pos_regex(),
but does not search for regular expressions.
def pos_regex(
        filepath,
        key:str,
        matches:int=0
    ) -> list:
    """Returns a list of the `(start, end)` positions of a `key` in a given `filepath` (actual file, not mmapped!).

    If no match is found, returns an empty list.

    The value `matches` specifies the max number of matches to return.
    Defaults to 0 to return all possible matches. Set it to 1 to return only one match,
    or to negative integers to start searching from the end of the file upwards.
    Matches are always listed in file order.

    The file is read as UTF-8 text and searched with `re.MULTILINE`,
    so `^` and `$` match at every line.

    For big files, this method is slower than `pos()`, but it can search for regular expressions.
    """
    file_path = file.get(filepath)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # NOTE(review): offsets index the decoded text; they equal the byte offsets
    # returned by pos() only for ASCII content with '\n' line endings.
    pattern = re.compile(key, flags=re.MULTILINE)  # MULTILINE for ^ regex
    if matches > 0:
        # Iterate over the whole text instead of searching successive slices,
        # so that anchors and look-behinds see the real context of each match.
        positions = []
        for found in pattern.finditer(content):
            positions.append(found.span())
            if len(positions) == matches:
                break
        return positions
    positions = [found.span() for found in pattern.finditer(content)]
    if matches < 0:
        positions = positions[matches:]  # Keep only the last |matches| items
    return positions
Returns a list of the positions of a key in a given filepath (actual file, not mmapped!).
The value matches specifies the max number of matches to return.
Defaults to 0 to return all possible matches. Set it to 1 to return only one match,
or to negative integers to start searching from the end of the file upwards.
For big files, this method is slower than pos(), but it can search for regular expressions.
def next_pos(
        filepath,
        position:tuple,
        key:str,
        match:int=1
    ) -> tuple:
    """Get the next position of the `key` in the `filepath` (file or mmapped file), starting from an initial `position` tuple.

    The `match` number specifies the nonzero index of the next match to return (1, 2... 0 is considered as 1!).
    It can be negative to search backwards from the initial position.
    If there are fewer matches than requested, the last one found is returned.
    Returns `(-1, -1)` if nothing is found, if the file is empty,
    or if the initial `position` is itself `(-1, -1)`.

    A memory map provided by the caller is left open.

    This method is specific for normal strings.
    To use regular expressions, check `next_pos_regex()`.
    """
    mm = filepath
    opened_here = not isinstance(filepath, mmap.mmap)
    if opened_here:
        file_path = file.get(filepath)
        with open(file_path, 'rb') as f:
            try:
                mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
            except ValueError:  # Empty files cannot be memory mapped
                return (-1, -1)
    try:
        if position == (-1, -1):
            # 'Not found' sentinel: using it as an index would search from the end of the file
            return (-1, -1)
        start, end = position
        keyword_bytes = key.encode()
        key_size = len(keyword_bytes)
        found = (-1, -1)
        if match >= 0:
            for _ in range(match or 1):
                start = mm.find(keyword_bytes, end)
                if start == -1:
                    break
                end = start + key_size
                found = (start, end)
        else:
            for _ in range(-match):
                start = mm.rfind(keyword_bytes, 0, start)
                if start == -1:
                    break
                found = (start, start + key_size)
        return found
    finally:
        if opened_here:
            mm.close()
Get the next position of the key in the filepath (file or mmapped file), starting from an initial position tuple.
The match number specifies the nonzero index of the next match to return (1, 2... 0 is considered as 1!).
It can be negative to search backwards from the initial position.
The last known positions will be returned if no more matches are found.
This method is specific for normal strings.
To use regular expressions, check next_pos_regex().
def next_pos_regex(
        filepath,
        position:tuple,
        key:str,
        match:int=0
    ) -> tuple:
    """Get the next position of the `key` in the `filepath` (actual file, not mmapped!), starting from an initial `position` tuple.

    The `match` number specifies the next match to return (1, 2... 0 is considered as 1!).
    It can be negative to search backwards from the initial position;
    only matches ending at or before the start of `position` are then considered.
    If there are fewer matches than requested, the last one found is returned.
    Returns `(-1, -1)` if nothing is found or if `position` is itself `(-1, -1)`.

    The file is read as UTF-8 text and searched with `re.MULTILINE`, as in `pos_regex()`.
    This method is specific for regular expressions.

    For normal strings, check the faster `next_pos()` method.
    """
    file_path = file.get(filepath)
    if position == (-1, -1):
        # 'Not found' sentinel: using it as an index would search from the end of the text
        return (-1, -1)
    start, end = position
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    # NOTE(review): offsets index the decoded text, not the raw bytes of the file.
    pattern = re.compile(key, flags=re.MULTILINE)
    if match >= 0:
        found = (-1, -1)
        for _ in range(match or 1):
            # Search from `end` on the full text so anchors keep their real context
            hit = pattern.search(content, end)
            if hit is None:
                break
            found = hit.span()
            end = hit.end()
        return found
    # Reverse match: keep only the matches located before the initial position
    candidates = [hit.span() for hit in pattern.finditer(content) if hit.end() <= start]
    if not candidates:
        return (-1, -1)
    # `match` is already negative; clamp it to the earliest available match
    return candidates[max(match, -len(candidates))]
Get the next position of the key in the filepath (actual file, not mmapped!), starting from an initial position tuple.
The match number specifies the next match to return (1, 2... 0 is considered as 1!).
It can be negative to search backwards from the initial position.
This method is specific for regular expressions.
For normal strings, check the faster next_pos() method.
def line_pos(
        filepath,
        position:tuple,
        skips:int=0
    ) -> tuple:
    """Get the position of the full line containing the `position` tuple in `filepath` (whether file or memory mapped file).

    The returned `(start, end)` excludes the newline that ends the line.

    A specific line below can be returned with `skips` being a natural int,
    or previous lines with negative values.
    Skipping past the last line returns an empty position at the end of the file,
    and skipping before the first line returns `(0, 0)`.

    Returns `(-1, -1)` if `position` is `(-1, -1)` or if the file is empty.
    A memory map provided by the caller is left open.
    """
    mm = filepath
    opened_here = not isinstance(filepath, mmap.mmap)
    if opened_here:
        file_path = file.get(filepath)
        with open(file_path, 'rb') as f:
            try:
                mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
            except ValueError:  # Empty files cannot be memory mapped
                return (-1, -1)
    try:
        if position == (-1, -1):  # No match
            return (-1, -1)
        size = len(mm)
        start, end = position
        if skips == 0:
            start = mm.rfind(b'\n', 0, start) + 1  # 0 when there is no previous newline
            end = mm.find(b'\n', end)
            if end == -1:  # Last line without trailing newline
                end = size
        elif skips > 0:
            for _ in range(skips):
                newline = mm.find(b'\n', end)
                if newline == -1:  # No more lines below
                    start = end = size
                    break
                start = newline + 1
                end = mm.find(b'\n', start)
                if end == -1:  # Last line without trailing newline
                    end = size
                    break
        else:  # previous lines
            for _ in range(-skips):
                end = mm.rfind(b'\n', 0, start)
                if end == -1:  # No more lines above
                    start = end = 0
                    break
                start = mm.rfind(b'\n', 0, end) + 1
        return start, end
    finally:
        if opened_here:
            mm.close()
Get the position of the full line containing the position tuple in filepath (whether file or memory mapped file).
A specific line below can be returned with skips being a natural int,
or previous lines with negative values.
def between_pos(
        filepath,
        key1:str,
        key2:str,
        include_keys:bool=True,
        match:int=1,
        regex:bool=False
    ) -> tuple:
    """Returns the positions of the content between the lines containing `key1` and `key2` in the `filepath`.

    Keywords can be at any position within the line.
    Regular expressions can be used by setting `regex=True`.

    Key lines are included by default; set `include_keys=False` to omit them.

    If there is more than one match, only the first one is considered by default;
    set `match` number to specify a particular match (1, 2... 0 is considered as 1!).
    Use negative numbers to start from the end of the file.

    If `key1` is not found, or the file is empty, returns `(-1, -1)`.
    If `key2` is not found, it returns the text position from `key1` to the end of the file.
    """
    file_path = file.get(filepath)
    if match == 0:
        match = 1
    # Read-only access is enough; 'rb' avoids requiring write permission
    with open(file_path, 'rb') as f:
        try:
            mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
        except ValueError:  # Empty files cannot be memory mapped
            return (-1, -1)
    with mm:  # A single memory map is shared by all the string searches
        if regex:
            positions_1 = pos_regex(file_path, key1, match)
        else:
            positions_1 = pos(mm, key1, match)
        if not positions_1:
            return (-1, -1)
        # Positions come in file order: the requested match is the last item
        # when counting from the top, and the first one when counting from the end.
        position_1 = positions_1[-1] if match > 0 else positions_1[0]
        if regex:
            position_2 = next_pos_regex(file_path, position_1, key2, 1)
        else:
            position_2 = next_pos(mm, position_1, key2, 1)
        # Without the key lines: start one line below key1, end one line above key2
        skip_line_1, skip_line_2 = (0, 0) if include_keys else (1, -1)
        start, _ = line_pos(mm, position_1, skip_line_1)
        if position_2 != (-1, -1):
            _, end = line_pos(mm, position_2, skip_line_2)
        else:
            end = len(mm)
    return start, end
Returns the positions of the content between the lines containing key1 and key2 in the filepath.
Keywords can be at any position within the line.
Regular expressions can be used by setting regex=True.
Key lines are included by default (include_keys=True); set include_keys=False to omit them.
If there is more than one match, only the first one is considered by default;
set match number to specify a particular match (1, 2... 0 is considered as 1!).
Use negative numbers to start from the end of the file.
If key2 is not found, it returns the text position from key1 to the end of the file.