Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 21 additions & 2 deletions man_spider/lib/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,26 @@ def extractous(self, file, pretty_filename):
if not self.match_magic(file):
return matches

text_content, metadata = self.extractor.extract_file_to_string(str(file))
# XML files: read raw to preserve attributes and handle encoding
if suffix == '.xml':
log.debug(f'Parsing raw XML for {pretty_filename}')
try:
# read raw bytes and detect BOM for encoding
with open(file, 'rb') as f:
raw_bytes = f.read()
if raw_bytes.startswith(b'\xff\xfe') or raw_bytes.startswith(b'\xfe\xff'):
text_content = raw_bytes.decode('utf-16', errors='ignore')
elif raw_bytes.startswith(b'\xef\xbb\xbf'):
text_content = raw_bytes.decode('utf-8-sig', errors='ignore')
else:
text_content = raw_bytes.decode('utf-8', errors='ignore')
metadata = {}
except Exception as e:
log.warning(f"Error reading raw XML for {pretty_filename}: {e}")
return matches
else:
# non-XML: extract text via extractous
text_content, metadata = self.extractor.extract_file_to_string(str(file))

# try to convert to UTF-8 for grep-friendliness
try:
Expand All @@ -162,4 +181,4 @@ def extractous(self, file, pretty_filename):
if not self.quiet:
self.grep(binary_content, _filter.pattern)

return matches
return matches