aton.txt.find

Description

Functions to search for specific content inside text files.

Index

Find and return specific text strings from a file
lines()
between()

Find the (start, end) position of specific strings in a file
pos()
pos_regex()
next_pos()
next_pos_regex()
line_pos()
between_pos()

Examples

To find the lines containing the word 'key', plus an additional line below,

from aton import txt
# with split = False
txt.find.lines(filepath=file, key='key', additional=1)
    # ['line with key 1\nline below first match',
    #  'line with key 2\nline below second match']
# with split = True
txt.find.lines(filepath=file, key='key', additional=1, split=True)
    # ['line with key 1',
    # 'line below first match',
    # 'line with key 2',
    # 'line below second match']

To find the text between the lines containing the words 'first' and 'second',

from aton import txt
txt.find.between(filepath=file, key1='first', key2='second')
    # 'first line\nadditional\nlines\nin\nbetween\nsecond line'

View Source

  1"""
  2# Description
  3
  4Functions to search for specific content inside text files.
  5
  6
  7# Index
  8
  9Find and return specific text strings from a file  
 10`lines()`  
 11`between()`  
 12
 13Find the `(start, end)` position of specific strings in a file  
 14`pos()`  
 15`pos_regex()`  
 16`next_pos()`  
 17`next_pos_regex()`  
 18`line_pos()`  
 19`between_pos()`  
 20
 21
 22# Examples
 23
 24To find the lines containing the word 'key', plus an additional line below,
 25```python
 26from aton import txt
 27# with split = False
 28txt.find.lines(filepath=file, key='key', additional=1)
 29    # ['line with key 1\\nline below first match',
 30    #  'line with key 2\\nline below second match']
 31# with split = True
 32txt.find.lines(filepath=file, key='key', additional=1, split=True)
 33    # ['line with key 1',
 34    # 'line below first match',
 35    # 'line with key 2',
 36    # 'line below second match']
 37```
 38
 39To find the text between the lines containing the words 'first' and 'second',
 40```python
 41from aton import txt
 42txt.find.between(filepath=file, key1='first', key2='second')
 43    # 'first line\\nadditional\\nlines\\nin\\nbetween\\nsecond line'
 44```
 45
 46---
 47"""
 48
 49
 50import mmap
 51import re
 52import aton.file as file
 53
 54
 55def lines(
 56        filepath:str,
 57        key:str,
 58        matches:int=0,
 59        additional:int=0,
 60        split: bool=False,
 61        regex:bool=False
 62    ) -> list:
 63    """Returns a list with the matches containing the `key` string in `filepath`.
 64
 65    If no match is found, returns an empty list.
 66
 67    To use regular expressions in the search, set `regex=True`
 68    (deactivated by default).
 69
 70    The value `matches` specifies the max number of matches to be returned.
 71    Defaults to 0 to return all possible matches. Set it to 1 to return only one match,
 72    or to negative integers to start the search from the end of the file upwards.
 73
 74    The value `additional` specifies the number of additional lines
 75    below the target line that are also returned;
 76    2 to return the found line plus two additional lines below, etc.
 77    Negative values return the specified number of lines before the target line.
 78    The original ordering from the file is preserved.
 79    Defaults to `additional=0`, only returning the target line.
 80    By default, the additional lines are returned
 81    in the same list item as the match separated by a `\\n`,
 82    unless `split=True`, in which case these additional lines
 83    are splitted and added as additional items in the list.
 84    """
 85    file_path = file.get(filepath)
 86    matches_found = []
 87    if regex:
 88        positions = pos_regex(file_path, key, matches)
 89    else:
 90        positions = pos(file_path, key, matches)
 91    with open(file_path, 'r+b') as f:
 92        try:
 93            mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
 94        except ValueError:
 95            return []
 96    for start, end in positions:
 97        # Get the positions of the full line containing the match
 98        line_start = mm.rfind(b'\n', 0, start) + 1
 99        line_end = mm.find(b'\n', end, len(mm))
100        # Default values for the start and end of the line
101        if line_start == -1: line_start = 0
102        if line_end == -1: line_end = len(mm)
103        # Adjust the line_end to add additional lines after the match
104        match_start = line_start
105        match_end = line_end
106        if additional > 0:
107            for _ in range(abs(additional)):
108                match_end = mm.find(b'\n', match_end + 1, len(mm)-1)
109                if match_end == -1:
110                    match_end = len(mm)
111                    break
112        elif additional < 0:
113            for _ in range(abs(additional)):
114                match_start = mm.rfind(b'\n', 0, match_start - 1) + 1
115                if match_start == -1:
116                    match_start = 0
117                    break
118        # Save the matched lines
119        matches_found.append(mm[match_start:match_end].decode())
120    if split:
121        splitted_matches_found = []
122        for string in matches_found:
123            splitted_match = string.splitlines()
124            splitted_matches_found.extend(splitted_match)
125        matches_found = splitted_matches_found
126    return matches_found
127
128
def between(
        filepath:str,
        key1:str,
        key2:str,
        include_keys:bool=True,
        match:int=1,
        regex:bool=False
    ) -> str:
    """Returns the content between the lines with `key1` and `key2` in `filepath`.

    Keywords can be at any position within the line.
    Regular expressions can be used by setting `regex=True`.

    Key lines are included by default;
    set `include_keys=False` to omit them.

    If there is more than one match, only the first one is considered by default;
    set `match` (int) to specify a particular match (1, 2... 0 is considered as 1!).
    Use negative numbers to start from the end of the file.

    If no match is found, or if the file is empty, returns an empty string.

    If `key2` is not found, it returns all the text from `key1` to the end of the file.
    """
    file_path = file.get(filepath)
    start, end = between_pos(file_path, key1, key2, include_keys, match, regex)
    if (start, end) == (-1, -1):
        return ''
    # Read-only mode is enough for an ACCESS_READ map,
    # and it also works on write-protected files
    with open(file_path, 'rb') as f:
        try:
            mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
        except ValueError:  # An empty file cannot be memory mapped
            return ''
    with mm:  # The map is released after copying the requested slice
        return mm[start:end].decode()
162
163
def pos(
        filepath,
        key:str,
        matches:int=0,
        ) -> list:
    """Returns a list with the `(start, end)` positions of the `key` in `filepath`.

    If no match is found, returns an empty list.
    An empty file or an empty `key` also return an empty list.

    The `filepath` can be a file or a memory mapped file.
    A memory mapped file given by the caller is left open;
    a map created here from a path is closed before returning.

    The value `matches` specifies the max number of matches to return.
    Defaults to 0 to return all possible matches.
    Set it to 1 to return only one match,
    2 to get the first two matches, etc.
    You can also set it to negative integers to start
    searching from the end of the file upwards.
    Positions are always returned in file order, and matches do not overlap.

    This method is faster than `pos_regex()`,
    but does not search for regular expressions.
    """
    keyword_bytes = key.encode()
    if not keyword_bytes:
        # An empty key matches everywhere without advancing the search,
        # which would never terminate when all matches are requested
        return []
    opened_here = not isinstance(filepath, mmap.mmap)
    if opened_here:
        file_path = file.get(filepath)
        # Read-only mode is enough for an ACCESS_READ map
        with open(file_path, 'rb') as f:
            try:
                mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
            except ValueError:  # An empty file cannot be mapped: nothing to find
                return []
    else:
        mm = filepath
    positions = []
    key_length = len(keyword_bytes)
    try:
        if matches >= 0:
            start = 0
            while matches == 0 or len(positions) < matches:
                found = mm.find(keyword_bytes, start)
                if found == -1:
                    break
                end = found + key_length
                positions.append((found, end))
                start = end  # Continue after this match
        else:
            start = len(mm)
            while len(positions) < -matches:
                found = mm.rfind(keyword_bytes, 0, start)
                if found == -1:
                    break
                positions.append((found, found + key_length))
                start = found  # Continue before this match
            positions.reverse()  # Back to file order
    finally:
        if opened_here:
            mm.close()
    return positions
217
218
def pos_regex(
        filepath,
        key:str,
        matches:int=0
    ) -> list:
    """Returns a list of the `(start, end)` positions of a `key` in a given `filepath` (actual file, not mmapped!).

    If no match is found, returns an empty list.

    The `key` is a regular expression, compiled with `re.MULTILINE`
    so that `^` and `$` also match at the start and end of every line.

    The value `matches` specifies the max number of matches to return.
    Defaults to 0 to return all possible matches. Set it to 1 to return only one match,
    or to negative integers to start searching from the end of the file upwards.
    Positions are always returned in file order.

    The positions are character offsets in the file decoded as UTF-8,
    which only equal the byte offsets from `pos()` if every character takes one byte.

    For big files, this method is slower than `pos()`, but it can search for regular expressions.
    """
    file_path = file.get(filepath)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    pattern = re.compile(key, flags=re.MULTILINE)  # MULTILINE for ^ regex
    if matches > 0:
        # Scan the whole text instead of re-searching slices of it:
        # anchors and look-behinds keep seeing the real context,
        # and zero-width matches are not reported more than once
        positions = []
        for found in pattern.finditer(content):
            positions.append(found.span())
            if len(positions) == matches:
                break
        return positions
    positions = [found.span() for found in pattern.finditer(content)]
    if matches < 0:
        positions = positions[matches:]  # Keep only the last abs(matches)
    return positions
253
254
def next_pos(
        filepath,
        position:tuple,
        key:str,
        match:int=1
    ) -> tuple:
    """Get the next position of the `key` in the `filepath` (file or mmapped file), starting from an initial `position` tuple.

    The `match` number specifies the nonzero index of the next match to return (1, 2... 0 is considered as 1!).
    It can be negative to search backwards from the initial position.
    If there are fewer matches than requested, the last one found is returned.

    Returns `(-1, -1)` if no match is found, if the file is empty,
    or if the initial `position` is itself the no-match value `(-1, -1)`.

    A memory mapped file given by the caller is left open;
    a map created here from a path is closed before returning.

    This method is specific for normal strings.
    To use regular expressions, check `next_pos_regex()`.
    """
    start, end = position
    if (start, end) == (-1, -1):
        # Nothing to continue from; -1 would otherwise be taken
        # as an offset counted from the end of the file
        return (-1, -1)
    keyword_bytes = key.encode()
    if match == 0:
        match = 1
    opened_here = not isinstance(filepath, mmap.mmap)
    if opened_here:
        file_path = file.get(filepath)
        # Read-only mode is enough for an ACCESS_READ map
        with open(file_path, 'rb') as f:
            try:
                mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
            except ValueError:  # An empty file cannot be mapped: nothing to find
                return (-1, -1)
    else:
        mm = filepath
    last_found = (-1, -1)
    key_length = len(keyword_bytes)
    try:
        if match > 0:
            for _ in range(match):
                start = mm.find(keyword_bytes, end)
                if start == -1:
                    break
                end = start + key_length
                last_found = (start, end)
        else:
            for _ in range(-match):
                # Only matches lying entirely before the current start
                start = mm.rfind(keyword_bytes, 0, start)
                if start == -1:
                    break
                last_found = (start, start + key_length)
    finally:
        if opened_here:
            mm.close()
    return last_found
301
302
def next_pos_regex(
        filepath,
        position:tuple,
        key:str,
        match:int=0
    ) -> tuple:
    """Get the next position of the `key` in the `filepath` (actual file, not mmapped!), starting from an initial `position` tuple.

    The `match` number specifies the next match to return (1, 2... 0 is considered as 1!).
    It can be negative to search backwards from the initial position,
    in which case only matches ending at or before the start of `position` are considered.
    If there are fewer matches than requested, the last one found is returned.

    Returns `(-1, -1)` if no match is found,
    or if the initial `position` is itself the no-match value `(-1, -1)`.

    This method is specific for regular expressions.
    The `key` is compiled with `re.MULTILINE` and the file is read as UTF-8,
    as in `pos_regex()`, so that both report comparable character offsets.

    For normal strings, check the faster `next_pos()` method.
    """
    file_path = file.get(filepath)
    start, end = position
    if (start, end) == (-1, -1):
        return (-1, -1)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    if match == 0:
        match = 1
    pattern = re.compile(key, flags=re.MULTILINE)
    if match > 0:
        # Search the full text from `end` rather than a slice of it,
        # so that anchors and look-behinds keep seeing the real context
        found = (-1, -1)
        for count, hit in enumerate(pattern.finditer(content, end), start=1):
            found = hit.span()
            if count == match:
                break
        return found
    # Reverse match: candidates in file order, the closest one is the last
    previous = [hit.span() for hit in pattern.finditer(content) if hit.end() <= start]
    if not previous:
        return (-1, -1)
    # Clamp to the farthest candidate if fewer than abs(match) are available
    return previous[max(match, -len(previous))]
347
348
def line_pos(
        filepath,
        position:tuple,
        skips:int=0
    ) -> tuple:
    """Get the position of the full line containing the `position` tuple in `filepath` (whether file or memory mapped file).

    A specific line below can be returned with `skips` being a natural int,
    or previous lines with negative values.

    The returned `(start, end)` excludes the newline that terminates the line.
    Skipping beyond the last line returns an empty position at the end of the file,
    and skipping before the first line returns `(0, 0)`.

    Returns `(-1, -1)` if `position` is the no-match value `(-1, -1)`
    or if the file is empty.

    A memory mapped file given by the caller is left open;
    a map created here from a path is closed before returning.
    """
    opened_here = not isinstance(filepath, mmap.mmap)
    if opened_here:
        file_path = file.get(filepath)
        # Read-only mode is enough for an ACCESS_READ map
        with open(file_path, 'rb') as f:
            try:
                mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
            except ValueError:  # An empty file cannot be memory mapped
                return (-1, -1)
    else:
        mm = filepath
    try:
        if position == (-1, -1):  # No match
            return (-1, -1)
        start, end = position
        size = len(mm)
        if skips == 0:
            # rfind() returns -1 without a previous newline, so +1 gives 0
            start = mm.rfind(b'\n', 0, start) + 1
            end = mm.find(b'\n', end)
            if end == -1:  # Last line without a trailing newline
                end = size
        elif skips > 0:
            for _ in range(skips):
                newline = mm.find(b'\n', end)
                if newline == -1:  # No more lines below
                    start = end = size
                    break
                start = newline + 1
                end = mm.find(b'\n', start)
                if end == -1:  # Last line without a trailing newline
                    end = size
        else:  # previous lines
            for _ in range(-skips):
                end = mm.rfind(b'\n', 0, start)
                if end == -1:  # No more lines above
                    start = end = 0
                    break
                start = mm.rfind(b'\n', 0, end) + 1
        return start, end
    finally:
        if opened_here:
            mm.close()
398
399
def between_pos(
        filepath,
        key1:str,
        key2:str,
        include_keys:bool=True,
        match:int=1,
        regex:bool=False
    ) -> tuple:
    """Returns the positions of the content between the lines containing `key1` and `key2` in the `filepath`.

    Keywords can be at any position within the line.
    Regular expressions can be used by setting `regex=True`.

    Key lines are included by default;
    set `include_keys=False` to omit them.

    If there is more than one match, only the first one is considered by default;
    set `match` number to specify a particular match (1, 2... 0 is considered as 1!).
    Use negative numbers to start from the end of the file.
    If there are fewer matches of `key1` than requested,
    the farthest one found is used.

    Returns `(-1, -1)` if `key1` is not found or if the file is empty.

    If `key2` is not found, it returns the text position from `key1` to the end of the file.

    NOTE: with `regex=True` the positions come from `pos_regex()` and `next_pos_regex()`,
    which work on decoded text, while the line limits are searched in the raw bytes;
    both only agree if one character is one byte in the file.
    """
    file_path = file.get(filepath)
    if match == 0:
        match = 1
    # Same procedure for both search modes, only the finders change
    find_all, find_next = (pos_regex, next_pos_regex) if regex else (pos, next_pos)
    positions_1: list = find_all(file_path, key1, match)
    if not positions_1:
        return (-1, -1)
    # Positions come in file order: the requested match is the last one
    # when counting from the top, and the first one when counting from the bottom
    position_1 = positions_1[-1] if match > 0 else positions_1[0]
    position_2: tuple = find_next(file_path, position_1, key2, 1)
    # Without the key lines, start one line below key1 and end one line above key2
    skip_line_1, skip_line_2 = (0, 0) if include_keys else (1, -1)
    # Read-only mode is enough for an ACCESS_READ map
    with open(file_path, 'rb') as f:
        try:
            mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
        except ValueError:  # An empty file cannot be memory mapped
            return (-1, -1)
    with mm:  # The map is released once the line limits are known
        start, _ = line_pos(mm, position_1, skip_line_1)
        if position_2 != (-1, -1):
            _, end = line_pos(mm, position_2, skip_line_2)
        else:
            end = len(mm)
    return start, end

def lines( filepath: str, key: str, matches: int = 0, additional: int = 0, split: bool = False, regex: bool = False) -> list: View Source

 56def lines(
 57        filepath:str,
 58        key:str,
 59        matches:int=0,
 60        additional:int=0,
 61        split: bool=False,
 62        regex:bool=False
 63    ) -> list:
 64    """Returns a list with the matches containing the `key` string in `filepath`.
 65
 66    If no match is found, returns an empty list.
 67
 68    To use regular expressions in the search, set `regex=True`
 69    (deactivated by default).
 70
 71    The value `matches` specifies the max number of matches to be returned.
 72    Defaults to 0 to return all possible matches. Set it to 1 to return only one match,
 73    or to negative integers to start the search from the end of the file upwards.
 74
 75    The value `additional` specifies the number of additional lines
 76    below the target line that are also returned;
 77    2 to return the found line plus two additional lines below, etc.
 78    Negative values return the specified number of lines before the target line.
 79    The original ordering from the file is preserved.
 80    Defaults to `additional=0`, only returning the target line.
 81    By default, the additional lines are returned
 82    in the same list item as the match separated by a `\\n`,
 83    unless `split=True`, in which case these additional lines
 84    are splitted and added as additional items in the list.
 85    """
 86    file_path = file.get(filepath)
 87    matches_found = []
 88    if regex:
 89        positions = pos_regex(file_path, key, matches)
 90    else:
 91        positions = pos(file_path, key, matches)
 92    with open(file_path, 'r+b') as f:
 93        try:
 94            mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
 95        except ValueError:
 96            return []
 97    for start, end in positions:
 98        # Get the positions of the full line containing the match
 99        line_start = mm.rfind(b'\n', 0, start) + 1
100        line_end = mm.find(b'\n', end, len(mm))
101        # Default values for the start and end of the line
102        if line_start == -1: line_start = 0
103        if line_end == -1: line_end = len(mm)
104        # Adjust the line_end to add additional lines after the match
105        match_start = line_start
106        match_end = line_end
107        if additional > 0:
108            for _ in range(abs(additional)):
109                match_end = mm.find(b'\n', match_end + 1, len(mm)-1)
110                if match_end == -1:
111                    match_end = len(mm)
112                    break
113        elif additional < 0:
114            for _ in range(abs(additional)):
115                match_start = mm.rfind(b'\n', 0, match_start - 1) + 1
116                if match_start == -1:
117                    match_start = 0
118                    break
119        # Save the matched lines
120        matches_found.append(mm[match_start:match_end].decode())
121    if split:
122        splitted_matches_found = []
123        for string in matches_found:
124            splitted_match = string.splitlines()
125            splitted_matches_found.extend(splitted_match)
126        matches_found = splitted_matches_found
127    return matches_found

Returns a list with the matches containing the key string in filepath.

If no match is found, returns an empty list.

To use regular expressions in the search, set regex=True (deactivated by default).

The value matches specifies the max number of matches to be returned. Defaults to 0 to return all possible matches. Set it to 1 to return only one match, or to negative integers to start the search from the end of the file upwards.

The value additional specifies the number of additional lines below the target line that are also returned; 2 to return the found line plus two additional lines below, etc. Negative values return the specified number of lines before the target line. The original ordering from the file is preserved. Defaults to additional=0, only returning the target line. By default, the additional lines are returned in the same list item as the match separated by a \n, unless split=True, in which case these additional lines are split and added as additional items in the list.

def between( filepath: str, key1: str, key2: str, include_keys: bool = True, match: int = 1, regex: bool = False) -> str: View Source

def between(
        filepath:str,
        key1:str,
        key2:str,
        include_keys:bool=True,
        match:int=1,
        regex:bool=False
    ) -> str:
    """Returns the content between the lines with `key1` and `key2` in `filepath`.

    Keywords can be at any position within the line.
    Regular expressions can be used by setting `regex=True`.

    Key lines are included by default;
    set `include_keys=False` to omit them.

    If there is more than one match, only the first one is considered by default;
    set `match` (int) to specify a particular match (1, 2... 0 is considered as 1!).
    Use negative numbers to start from the end of the file.

    If no match is found, or if the file is empty, returns an empty string.

    If `key2` is not found, it returns all the text from `key1` to the end of the file.
    """
    file_path = file.get(filepath)
    start, end = between_pos(file_path, key1, key2, include_keys, match, regex)
    if (start, end) == (-1, -1):
        return ''
    # Read-only mode is enough for an ACCESS_READ map,
    # and it also works on write-protected files
    with open(file_path, 'rb') as f:
        try:
            mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
        except ValueError:  # An empty file cannot be memory mapped
            return ''
    with mm:  # The map is released after copying the requested slice
        return mm[start:end].decode()

Returns the content between the lines with key1 and key2 in filepath.

Keywords can be at any position within the line. Regular expressions can be used by setting regex=True.

Key lines are included by default (include_keys=True); set include_keys=False to omit them.

If there is more than one match, only the first one is considered by default; set match (int) to specify a particular match (1, 2... 0 is considered as 1!). Use negative numbers to start from the end of the file.

If no match is found, returns an empty string.

If key2 is not found, it returns all the text from key1 to the end of the file.

def pos(filepath, key: str, matches: int = 0) -> list: View Source

def pos(
        filepath,
        key:str,
        matches:int=0,
        ) -> list:
    """Returns a list with the `(start, end)` positions of the `key` in `filepath`.

    If no match is found, returns an empty list.
    An empty file or an empty `key` also return an empty list.

    The `filepath` can be a file or a memory mapped file.
    A memory mapped file given by the caller is left open;
    a map created here from a path is closed before returning.

    The value `matches` specifies the max number of matches to return.
    Defaults to 0 to return all possible matches.
    Set it to 1 to return only one match,
    2 to get the first two matches, etc.
    You can also set it to negative integers to start
    searching from the end of the file upwards.
    Positions are always returned in file order, and matches do not overlap.

    This method is faster than `pos_regex()`,
    but does not search for regular expressions.
    """
    keyword_bytes = key.encode()
    if not keyword_bytes:
        # An empty key matches everywhere without advancing the search,
        # which would never terminate when all matches are requested
        return []
    opened_here = not isinstance(filepath, mmap.mmap)
    if opened_here:
        file_path = file.get(filepath)
        # Read-only mode is enough for an ACCESS_READ map
        with open(file_path, 'rb') as f:
            try:
                mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
            except ValueError:  # An empty file cannot be mapped: nothing to find
                return []
    else:
        mm = filepath
    positions = []
    key_length = len(keyword_bytes)
    try:
        if matches >= 0:
            start = 0
            while matches == 0 or len(positions) < matches:
                found = mm.find(keyword_bytes, start)
                if found == -1:
                    break
                end = found + key_length
                positions.append((found, end))
                start = end  # Continue after this match
        else:
            start = len(mm)
            while len(positions) < -matches:
                found = mm.rfind(keyword_bytes, 0, start)
                if found == -1:
                    break
                positions.append((found, found + key_length))
                start = found  # Continue before this match
            positions.reverse()  # Back to file order
    finally:
        if opened_here:
            mm.close()
    return positions

Returns a list with the positions of the key in filepath.

If no match is found, returns an empty list.

The filepath can be a file or a memory mapped file.

The value matches specifies the max number of matches to return. Defaults to 0 to return all possible matches. Set it to 1 to return only one match, 2 to get the first two matches, etc. You can also set it to negative integers to start searching from the end of the file upwards.

This method is faster than pos_regex(), but does not search for regular expressions.

def pos_regex(filepath, key: str, matches: int = 0) -> list: View Source

def pos_regex(
        filepath,
        key:str,
        matches:int=0
    ) -> list:
    """Returns a list of the `(start, end)` positions of a `key` in a given `filepath` (actual file, not mmapped!).

    If no match is found, returns an empty list.

    The `key` is a regular expression, compiled with `re.MULTILINE`
    so that `^` and `$` also match at the start and end of every line.

    The value `matches` specifies the max number of matches to return.
    Defaults to 0 to return all possible matches. Set it to 1 to return only one match,
    or to negative integers to start searching from the end of the file upwards.
    Positions are always returned in file order.

    The positions are character offsets in the file decoded as UTF-8,
    which only equal the byte offsets from `pos()` if every character takes one byte.

    For big files, this method is slower than `pos()`, but it can search for regular expressions.
    """
    file_path = file.get(filepath)
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    pattern = re.compile(key, flags=re.MULTILINE)  # MULTILINE for ^ regex
    if matches > 0:
        # Scan the whole text instead of re-searching slices of it:
        # anchors and look-behinds keep seeing the real context,
        # and zero-width matches are not reported more than once
        positions = []
        for found in pattern.finditer(content):
            positions.append(found.span())
            if len(positions) == matches:
                break
        return positions
    positions = [found.span() for found in pattern.finditer(content)]
    if matches < 0:
        positions = positions[matches:]  # Keep only the last abs(matches)
    return positions

Returns a list of the positions of a key in a given filepath (actual file, not mmapped!).

The value matches specifies the max number of matches to return. Defaults to 0 to return all possible matches. Set it to 1 to return only one match, or to negative integers to start searching from the end of the file upwards.

For big files, this method is slower than pos(), but it can search for regular expressions.

def next_pos(filepath, position: tuple, key: str, match: int = 1) -> tuple: View Source

def next_pos(
        filepath,
        position:tuple,
        key:str,
        match:int=1
    ) -> tuple:
    """Get the next position of the `key` in the `filepath` (file or mmapped file), starting from an initial `position` tuple.

    The `match` number specifies the nonzero index of the next match to return (1, 2... 0 is considered as 1!).
    It can be negative to search backwards from the initial position.
    If there are fewer matches than requested, the last one found is returned.

    Returns `(-1, -1)` if no match is found, if the file is empty,
    or if the initial `position` is itself the no-match value `(-1, -1)`.

    A memory mapped file given by the caller is left open;
    a map created here from a path is closed before returning.

    This method is specific for normal strings.
    To use regular expressions, check `next_pos_regex()`.
    """
    start, end = position
    if (start, end) == (-1, -1):
        # Nothing to continue from; -1 would otherwise be taken
        # as an offset counted from the end of the file
        return (-1, -1)
    keyword_bytes = key.encode()
    if match == 0:
        match = 1
    opened_here = not isinstance(filepath, mmap.mmap)
    if opened_here:
        file_path = file.get(filepath)
        # Read-only mode is enough for an ACCESS_READ map
        with open(file_path, 'rb') as f:
            try:
                mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
            except ValueError:  # An empty file cannot be mapped: nothing to find
                return (-1, -1)
    else:
        mm = filepath
    last_found = (-1, -1)
    key_length = len(keyword_bytes)
    try:
        if match > 0:
            for _ in range(match):
                start = mm.find(keyword_bytes, end)
                if start == -1:
                    break
                end = start + key_length
                last_found = (start, end)
        else:
            for _ in range(-match):
                # Only matches lying entirely before the current start
                start = mm.rfind(keyword_bytes, 0, start)
                if start == -1:
                    break
                last_found = (start, start + key_length)
    finally:
        if opened_here:
            mm.close()
    return last_found

Get the next position of the key in the filepath (file or mmapped file), starting from an initial position tuple.

The match number specifies the nonzero index of the next match to return (1, 2... 0 is considered as 1!). It can be negative to search backwards from the initial position. The last known positions will be returned if no more matches are found.

This method is specific for normal strings. To use regular expressions, check next_pos_regex().

def next_pos_regex(filepath, position: tuple, key: str, match: int = 0) -> tuple: View Source

def next_pos_regex(
        filepath,
        position:tuple,
        key:str,
        match:int=0
    ) -> tuple:
    """Get the next position of the `key` in the `filepath` (actual file, not mmapped!), starting from an initial `position` tuple.

    The `match` number specifies the next match to return (1, 2... 0 is considered as 1!).
    It can be negative to search backwards from the initial position;
    in that case only the matches located entirely before the start of `position` are considered.

    If fewer matches than requested are available, the furthest one found is returned.
    If there are no matches at all, it returns `(-1, -1)`.

    The file is read in text mode, so the returned `(start, end)` values
    are character offsets within the decoded content.

    This method is specific for regular expressions.
    For normal strings, check the faster `next_pos()` method.
    """
    file_path = file.get(filepath)
    start, end = position
    with open(file_path, 'r') as f:
        content = f.read()
    if match == 0:
        match = 1
    pattern = re.compile(key)
    if match > 0:
        found = (-1, -1)
        for _ in range(match):
            match_found = pattern.search(content[end:])
            if not match_found:
                break
            # Offsets reported on the slice are relative to the previous `end`
            start = end + match_found.start()
            end = end + match_found.end()
            found = (start, end)
        return found
    # Reverse match: only consider the text located before the initial position
    previous_matches = list(pattern.finditer(content[:start]))
    if not previous_matches:
        return (-1, -1)
    # `match` is already negative; clamp it so that asking for more matches
    # than available returns the furthest one instead of failing
    index = max(match, -len(previous_matches))
    match_found = previous_matches[index]
    return match_found.start(), match_found.end()

Get the next position of the key in the filepath (actual file, not mmapped!), starting from an initial position tuple.

The match number specifies the next match to return (1, 2... 0 is considered as 1!). It can be negative to search backwards from the initial position. This method is specific for regular expressions.

For normal strings, check the faster next_pos() method.

def line_pos(filepath, position: tuple, skips: int = 0) -> tuple: View Source

def line_pos(
        filepath,
        position:tuple,
        skips:int=0
    ) -> tuple:
    """Get the position of the full line containing the `position` tuple in `filepath` (whether file or memory mapped file).

    A line below can be selected with a positive `skips` value,
    or a previous line with a negative value.

    The returned `(start, end)` byte positions exclude the trailing newline.
    If the line is the last one and has no trailing newline, `end` is the size of the file.
    Skipping beyond the last line returns `(size, size)`,
    and skipping before the first line returns `(0, 0)`.

    Returns `(-1, -1)` if `position` is `(-1, -1)` or if the file cannot be memory mapped (e.g. it is empty).
    """
    owns_map = not isinstance(filepath, mmap.mmap)
    mm = filepath
    if owns_map:
        file_path = file.get(filepath)
        # Read-only access is enough for an ACCESS_READ mapping
        with open(file_path, 'rb') as f:
            try:
                mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
            except ValueError:  # Raised for empty files
                return (-1, -1)
    try:
        if tuple(position) == (-1, -1):  # No match
            return (-1, -1)
        start, end = position
        size = len(mm)
        if skips == 0:
            start = mm.rfind(b'\n', 0, start) + 1
            end = mm.find(b'\n', end, size)
            if end == -1:  # Last line without trailing newline
                end = size
        elif skips > 0:
            for _ in range(skips):
                newline = mm.find(b'\n', end, size)
                if newline == -1:  # There are no more lines below
                    start = size
                    end = size
                    break
                start = newline + 1
                end = mm.find(b'\n', start, size)
                if end == -1:  # Last line without trailing newline
                    end = size
        else:  # previous lines
            for _ in range(-skips):
                end = mm.rfind(b'\n', 0, start)
                if end == -1:  # There are no more lines above
                    start = 0
                    end = 0
                    break
                # rfind returns -1 on the first line, which gives the correct start = 0
                start = mm.rfind(b'\n', 0, end) + 1
        return start, end
    finally:
        # Only close the memory map if it was created here; never the caller's
        if owns_map:
            mm.close()

Get the position of the full line containing the position tuple in filepath (whether file or memory mapped file).

A line below can be selected by setting skips to a positive integer, or a previous line by setting it to a negative value.

def between_pos( filepath, key1: str, key2: str, include_keys: bool = True, match: int = 1, regex: bool = False) -> tuple: View Source

def between_pos(
        filepath,
        key1:str,
        key2:str,
        include_keys:bool=True,
        match:int=1,
        regex:bool=False
    ) -> tuple:
    """Returns the positions of the content between the lines containing `key1` and `key2` in the `filepath`.

    Keywords can be at any position within the line.
    Regular expressions can be used by setting `regex=True`.

    Key lines are included by default, but can be omitted with `include_keys=False`.

    If there is more than one match, only the first one is considered by default;
    set `match` number to specify a particular match (1, 2... 0 is considered as 1!).
    Use negative numbers to start from the end of the file.

    If `key2` is not found, it returns the text position from `key1` to the end of the file.
    If `key1` is not found, or the file cannot be memory mapped (e.g. it is empty), it returns `(-1, -1)`.
    """
    file_path = file.get(filepath)
    if match == 0:
        match = 1
    # Both search flavours share the same call signature
    if regex:
        find_all, find_next = pos_regex, next_pos_regex
    else:
        find_all, find_next = pos, next_pos
    # NOTE(review): the regex helpers work on decoded text while line_pos() works on bytes;
    # offsets presumably only agree for single-byte encoded files with '\n' newlines - confirm.
    positions_1: list = find_all(file_path, key1, match)
    if not positions_1:
        return (-1, -1)
    # Same element the original picked after reversing for match > 0,
    # but without mutating the returned list
    position_1 = positions_1[-1] if match > 0 else positions_1[0]
    position_2: tuple = find_next(file_path, position_1, key2, 1)
    if include_keys:
        skip_line_1, skip_line_2 = 0, 0
    else:  # Start one line below key1, and stop one line above key2
        skip_line_1, skip_line_2 = 1, -1
    # Read-only access is enough for an ACCESS_READ mapping
    with open(file_path, 'rb') as f:
        try:
            mm = mmap.mmap(f.fileno(), length=0, access=mmap.ACCESS_READ)
        except ValueError:  # Raised for empty files
            return (-1, -1)
    try:
        start, _ = line_pos(mm, position_1, skip_line_1)
        if position_2 != (-1, -1):
            _, end = line_pos(mm, position_2, skip_line_2)
            if end == -1:  # Defensive: treat a missing line end as the end of the file
                end = len(mm)
        else:
            end = len(mm)
    finally:
        mm.close()  # Release the memory map, it is not needed outside this function
    return start, end

Returns the positions of the content between the lines containing key1 and key2 in the filepath.

Keywords can be at any position within the line. Regular expressions can be used by setting regex=True.

Key lines are included by default, but can be omitted with include_keys=False.

If there is more than one match, only the first one is considered by default; set match number to specify a particular match (1, 2... 0 is considered as 1!). Use negative numbers to start from the end of the file.

If key2 is not found, it returns the text position from key1 to the end of the file.