src/lmflow/utils/data_utils.py: 69 changes (36 additions & 33 deletions)
@@ -94,42 +94,46 @@ def batchlize(examples: list, batch_size: int, random_shuffle: bool):
     return dataloader


-def read_last_n_lines_large_file(file_path: str, n: int = 10) -> List[str]:
-    with open(file_path, 'rb') as f:
-        f.seek(0, os.SEEK_END)
-        buffer = bytearray()
-        pointer = f.tell()
-        while pointer >= 0 and len(buffer.splitlines()) <= n:
-            f.seek(pointer)
-            read_byte = f.read(1)
-            buffer.extend(read_byte)
-            pointer -= 1
-        return buffer[::-1].decode('utf-8').splitlines()[-n:]
-
-
-def read_first_n_lines_large_file(file_path: str, n: int = 10) -> List[str]:
-    with open(file_path, 'rb') as f:
-        f.seek(0)
-        lines = []
-        for i in range(n):
-            line = f.readline()
-            if not line:
-                break
-            lines.append(line.decode('utf-8').strip())
-        return lines
+def preview_file(file_path: str, chars: int = 100):
+    """
+    Returns the first and last specified number of characters from a file
+    without loading the entire file into memory, working with any file type.
+
+    Args:
+        file_path (str): Path to the file to be previewed
+        chars (int, optional): Number of characters to show from start and end. Defaults to 100.
+
+    Returns:
+        tuple: (first_chars, last_chars) - The first and last characters from the file
+    """
+    file_size = os.path.getsize(file_path)
+
+    with open(file_path, 'r', encoding='utf-8') as f:
+        first_chars = f.read(chars)
+
+        if file_size <= 2 * chars:
+            return first_chars, ""
+
+        last_chunk_position = max(0, file_size - chars)
+
+        f.seek(last_chunk_position)
+
+        last_chars = f.read(chars)
+
+    return first_chars, last_chars


-def get_dataset_type_fast(file_path: str, max_lines: int = 100) -> Union[str, None]:
+def get_dataset_type_fast(file_path: str, max_chars: int = 100) -> Union[str, None]:
     '''Get the type values from the first and last n lines of a large json dataset.
     '''
-    lines = []
+    file_content_preview = []
     dataset_type = None
     dataset_type_pattern = re.compile(r'[\"\']type[\"\']:\s*[\'\"]([^"]+)[\'\"]')
-    lines.extend(read_first_n_lines_large_file(file_path, max_lines))
-    lines.extend(read_last_n_lines_large_file(file_path, max_lines))
-    for line in lines:
+    file_content_preview.extend(preview_file(file_path, max_chars))
+    for content in file_content_preview:
         try:
-            dataset_type = dataset_type_pattern.search(line).group(1)
+            dataset_type = dataset_type_pattern.search(content).group(1)
             break
         except AttributeError:
             continue
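
For reference, here is a minimal sketch of how the new preview_file helper behaves. The throwaway JSON file is purely illustrative; the import path follows the module's location at src/lmflow/utils/data_utils.py.

import json
import tempfile

from lmflow.utils.data_utils import preview_file

# Write a tiny dataset so the sketch is self-contained (illustrative only).
with tempfile.NamedTemporaryFile('w', suffix='.json', delete=False) as tmp:
    json.dump({"type": "text_only", "instances": [{"text": "hello"}]}, tmp)
    path = tmp.name

# At most `chars` characters are read from each end, never the whole file.
first, last = preview_file(path, chars=20)
print(first)  # the first 20 characters of the file
print(last)   # the last 20, or "" when the file fits within 2 * chars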
@@ -139,12 +143,11 @@ def get_dataset_type_fast(file_path: str, max_lines: int = 100) -> Union[str, None]:
 def check_dataset_instances_key_fast(file_path: str, instances_key: str, max_lines: int = 100) -> bool:
     '''Check if the dataset instances key matches the instance_key.
     '''
-    lines = []
+    file_content_preview = []
     instance_key_pattern = re.compile(r'[\"\']' + instances_key + r'[\"\']')
-    lines.extend(read_first_n_lines_large_file(file_path, max_lines))
-    lines.extend(read_last_n_lines_large_file(file_path, max_lines))
-    for line in lines:
-        if instance_key_pattern.search(line):
+    file_content_preview.extend(preview_file(file_path, max_lines))
+    for content in file_content_preview:
+        if instance_key_pattern.search(content):
             return True
     return False
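
A similar sketch for the two fast checks, reusing the illustrative file from above and assuming, per its signature, that get_dataset_type_fast returns the matched type string:

from lmflow.utils.data_utils import (
    check_dataset_instances_key_fast,
    get_dataset_type_fast,
)

# Only the short previews returned by preview_file are searched, so both
# checks stay cheap even on multi-gigabyte dataset files.
print(get_dataset_type_fast(path, max_chars=100))           # 'text_only'
print(check_dataset_instances_key_fast(path, 'instances'))  # True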
