src/lmflow/utils/data_utils.py: 69 changes (36 additions & 33 deletions)
@@ -94,42 +94,46 @@ def batchlize(examples: list, batch_size: int, random_shuffle: bool):
     return dataloader


-def read_last_n_lines_large_file(file_path: str, n: int = 10) -> List[str]:
-    with open(file_path, 'rb') as f:
-        f.seek(0, os.SEEK_END)
-        buffer = bytearray()
-        pointer = f.tell()
-        while pointer >= 0 and len(buffer.splitlines()) <= n:
-            f.seek(pointer)
-            read_byte = f.read(1)
-            buffer.extend(read_byte)
-            pointer -= 1
-        return buffer[::-1].decode('utf-8').splitlines()[-n:]
-
-
-def read_first_n_lines_large_file(file_path: str, n: int = 10) -> List[str]:
-    with open(file_path, 'rb') as f:
-        f.seek(0)
-        lines = []
-        for i in range(n):
-            line = f.readline()
-            if not line:
-                break
-            lines.append(line.decode('utf-8').strip())
-        return lines
+def preview_file(file_path: str, chars: int = 100):
+    """
+    Returns the first and last specified number of characters from a file
+    without loading the entire file into memory, working with any file type.
+
+    Args:
+        file_path (str): Path to the file to be previewed
+        chars (int, optional): Number of characters to show from start and end. Defaults to 100.
+
+    Returns:
+        tuple: (first_chars, last_chars) - The first and last characters from the file
+    """
+    file_size = os.path.getsize(file_path)
+
+    with open(file_path, 'r', encoding='utf-8') as f:
+        first_chars = f.read(chars)
+
+        if file_size <= 2 * chars:
+            return first_chars, ""
+
+        last_chunk_position = max(0, file_size - chars)
+
+        f.seek(last_chunk_position)
+
+        last_chars = f.read(chars)
+
+    return first_chars, last_chars


-def get_dataset_type_fast(file_path: str, max_lines: int = 100) -> Union[str, None]:
+def get_dataset_type_fast(file_path: str, max_chars: int = 100) -> Union[str, None]:
     '''Get the type values from the first and last n lines of a large json dataset.
     '''
-    lines = []
+    file_content_preview = []
     dataset_type = None
     dataset_type_pattern = re.compile(r'[\"\']type[\"\']:\s*[\'\"]([^"]+)[\'\"]')
-    lines.extend(read_first_n_lines_large_file(file_path, max_lines))
-    lines.extend(read_last_n_lines_large_file(file_path, max_lines))
-    for line in lines:
+    file_content_preview.extend(preview_file(file_path, max_chars))
+    for content in file_content_preview:
         try:
-            dataset_type = dataset_type_pattern.search(line).group(1)
+            dataset_type = dataset_type_pattern.search(content).group(1)
             break
         except AttributeError:
             continue
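
For reference, here is a minimal sketch of how the new preview_file helper behaves. The throwaway JSON file is purely illustrative; the import path follows the module's location at src/lmflow/utils/data_utils.py.

import json
import tempfile

from lmflow.utils.data_utils import preview_file

# Write a tiny dataset so the sketch is self-contained (illustrative only).
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as tmp:
    json.dump({"type": "text_only", "instances": [{"text": "hello"}]}, tmp)
    path = tmp.name

# At most `chars` characters are read from each end, never the whole file.
first, last = preview_file(path, chars=20)
print(first)  # the first 20 characters of the file
print(last)   # the last 20, or "" when the file fits within 2 * chars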
@@ -139,12 +143,11 @@ def get_dataset_type_fast(file_path: str, max_lines: int = 100) -> Union[str, None]:
 def check_dataset_instances_key_fast(file_path: str, instances_key: str, max_lines: int = 100) -> bool:
     '''Check if the dataset instances key matches the instance_key.
     '''
-    lines = []
+    file_content_preview = []
     instance_key_pattern = re.compile(r'[\"\']' + instances_key + r'[\"\']')
-    lines.extend(read_first_n_lines_large_file(file_path, max_lines))
-    lines.extend(read_last_n_lines_large_file(file_path, max_lines))
-    for line in lines:
-        if instance_key_pattern.search(line):
+    file_content_preview.extend(preview_file(file_path, max_lines))
+    for content in file_content_preview:
+        if instance_key_pattern.search(content):
             return True
     return False
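
A similar sketch for the two fast checks, reusing the illustrative file from above and assuming, per its signature, that get_dataset_type_fast returns the matched type string:

from lmflow.utils.data_utils import (
    check_dataset_instances_key_fast,
    get_dataset_type_fast,
)

# Only the short previews returned by preview_file are searched, so both
# checks stay cheap even on multi-gigabyte dataset files.
print(get_dataset_type_fast(path, max_chars=100))           # 'text_only'
print(check_dataset_instances_key_fast(path, 'instances'))  # True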
