blacklanternsecurity · ADScanPro · Jul 8, 2025
diff --git a/man_spider/lib/parser/parser.py b/man_spider/lib/parser/parser.py
@@ -141,7 +141,26 @@ def extractous(self, file, pretty_filename):
         if not self.match_magic(file):
             return matches
 
-        text_content, metadata = self.extractor.extract_file_to_string(str(file))
+        # XML files: read raw to preserve attributes and handle encoding
+        if suffix == '.xml':
+            log.debug(f'Parsing raw XML for {pretty_filename}')
+            try:
+                # read raw bytes and pick the codec from the byte-order mark
+                with open(file, 'rb') as f:
+                    raw_bytes = f.read()
+                if raw_bytes.startswith((b'\xff\xfe', b'\xfe\xff')):
+                    text_content = raw_bytes.decode('utf-16', errors='ignore')
+                elif raw_bytes.startswith(b'\xef\xbb\xbf'):  # UTF-8 BOM is EF BB BF
+                    text_content = raw_bytes.decode('utf-8-sig', errors='ignore')
+                else:
+                    text_content = raw_bytes.decode('utf-8', errors='ignore')
+                metadata = {}
+            except Exception as e:
+                log.warning(f"Error reading raw XML for {pretty_filename}: {e}")
+                return matches
+        else:
+            # non-XML: extract text via extractous
+            text_content, metadata = self.extractor.extract_file_to_string(str(file))
 
         # try to convert to UTF-8 for grep-friendliness
         try:
@@ -162,4 +181,4 @@ def extractous(self, file, pretty_filename):
             if not self.quiet:
                 self.grep(binary_content, _filter.pattern)
 
-        return matches
+        return matches