Skip to content

Commit 4d5b51a

Browse files
committed
Update Document segmentation for CLI
1 parent 36e957e commit 4d5b51a

File tree

5 files changed

+260
-96
lines changed

5 files changed

+260
-96
lines changed

cli/cli_app.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@ def __init__(self):
3737
self.app = None # Will be initialized by workflow adapter
3838
self.logger = None
3939
self.context = None
40+
# Document segmentation configuration
41+
self.segmentation_config = {"enabled": True, "size_threshold_chars": 50000}
4042

4143
async def initialize_mcp_app(self):
4244
"""初始化MCP应用 - 使用工作流适配器"""
@@ -47,9 +49,51 @@ async def cleanup_mcp_app(self):
4749
"""清理MCP应用 - 使用工作流适配器"""
4850
await self.workflow_adapter.cleanup_mcp_app()
4951

52+
def update_segmentation_config(self):
    """Write the current segmentation settings into mcp_agent.config.yaml.

    Copies ``self.segmentation_config["enabled"]`` and
    ``self.segmentation_config["size_threshold_chars"]`` into the
    ``document_segmentation`` section of the YAML file located one
    directory above this module's directory, then rewrites the file.

    This is a best-effort operation: any failure is reported through
    ``self.cli.print_status`` as a warning and is not re-raised, so the
    caller can continue with whatever configuration is already on disk.

    Note: the file is re-serialized with ``yaml.dump``, so comments and
    custom formatting in the existing YAML are not preserved.
    """
    import os

    import yaml

    config_path = os.path.join(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
        "mcp_agent.config.yaml",
    )

    try:
        # Read current config
        with open(config_path, "r", encoding="utf-8") as f:
            config = yaml.safe_load(f)

        # yaml.safe_load returns None for an empty (or comment-only) file;
        # treat that as an empty mapping instead of failing on `in`.
        if config is None:
            config = {}
        if not isinstance(config, dict):
            raise ValueError(
                "top-level YAML node in mcp_agent.config.yaml must be a mapping"
            )

        # A bare `document_segmentation:` key parses to None; replace any
        # non-mapping value so the assignments below cannot fail.
        section = config.get("document_segmentation")
        if not isinstance(section, dict):
            section = {}
            config["document_segmentation"] = section

        section["enabled"] = self.segmentation_config["enabled"]
        section["size_threshold_chars"] = self.segmentation_config[
            "size_threshold_chars"
        ]

        # Serialize before opening the file for writing: opening with "w"
        # truncates immediately, so a dump failure would otherwise leave an
        # empty or partial config file behind.
        serialized = yaml.dump(
            config, default_flow_style=False, allow_unicode=True
        )
        with open(config_path, "w", encoding="utf-8") as f:
            f.write(serialized)

        self.cli.print_status(
            "📄 Document segmentation configuration updated", "success"
        )

    except Exception as e:
        # Deliberately broad: a config-sync failure must not abort processing.
        self.cli.print_status(
            f"⚠️ Failed to update segmentation config: {str(e)}", "warning"
        )
90+
5091
async def process_input(self, input_source: str, input_type: str):
5192
"""处理输入源(URL或文件)- 使用升级版智能体编排引擎"""
5293
try:
94+
# Update segmentation configuration before processing
95+
self.update_segmentation_config()
96+
5397
self.cli.print_separator()
5498
self.cli.print_status(
5599
"🚀 Starting intelligent agent orchestration...", "processing"
@@ -237,8 +281,20 @@ async def run_interactive_session(self):
237281
self.cli.show_history()
238282

239283
elif choice in ["c", "config", "configure"]:
284+
# Sync current segmentation config from CLI interface
285+
self.segmentation_config["enabled"] = self.cli.segmentation_enabled
286+
self.segmentation_config["size_threshold_chars"] = (
287+
self.cli.segmentation_threshold
288+
)
289+
240290
self.cli.show_configuration_menu()
241291

292+
# Sync back from CLI interface after configuration changes
293+
self.segmentation_config["enabled"] = self.cli.segmentation_enabled
294+
self.segmentation_config["size_threshold_chars"] = (
295+
self.cli.segmentation_threshold
296+
)
297+
242298
else:
243299
self.cli.print_status(
244300
"Invalid choice. Please select U, F, T, C, H, or Q.", "warning"

cli/cli_interface.py

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ def __init__(self):
4040
self.is_running = True
4141
self.processing_history = []
4242
self.enable_indexing = True # Default configuration
43+
self.segmentation_enabled = True # Default to smart segmentation
44+
self.segmentation_threshold = 50000 # Default threshold
4345

4446
# Check tkinter availability for file dialogs
4547
self.tkinter_available = True
@@ -125,6 +127,9 @@ def create_menu(self):
125127
# Display current configuration
126128
pipeline_mode = "🧠 COMPREHENSIVE" if self.enable_indexing else "⚡ OPTIMIZED"
127129
index_status = "✅ Enabled" if self.enable_indexing else "🔶 Disabled"
130+
segmentation_mode = (
131+
"📄 SMART" if self.segmentation_enabled else "📋 TRADITIONAL"
132+
)
128133

129134
menu = f"""
130135
{Colors.BOLD}{Colors.CYAN}╔═══════════════════════════════════════════════════════════════════════════════╗
@@ -135,6 +140,7 @@ def create_menu(self):
135140
║ ║
136141
{Colors.BOLD}🤖 Current Pipeline Mode: {pipeline_mode}{Colors.CYAN}
137142
{Colors.BOLD}🗂️ Codebase Indexing: {index_status}{Colors.CYAN}
143+
{Colors.BOLD}📄 Document Processing: {segmentation_mode}{Colors.CYAN}
138144
║ ║
139145
{Colors.YELLOW}📝 URL Processing:{Colors.CYAN}
140146
{Colors.YELLOW} ▶ Enter research paper URL (arXiv, IEEE, ACM, etc.) {Colors.CYAN}
@@ -693,6 +699,11 @@ def show_history(self):
693699
def show_configuration_menu(self):
694700
"""Show configuration options menu"""
695701
self.clear_screen()
702+
703+
# Get segmentation config status
704+
segmentation_enabled = getattr(self, "segmentation_enabled", True)
705+
segmentation_threshold = getattr(self, "segmentation_threshold", 50000)
706+
696707
print(f"""
697708
{Colors.BOLD}{Colors.CYAN}╔═══════════════════════════════════════════════════════════════════════════════╗
698709
║ CONFIGURATION MENU ║
@@ -716,9 +727,23 @@ def show_configuration_menu(self):
716727
║ ✗ Repository Acquisition (Skipped) ║
717728
║ ✗ Codebase Intelligence Orchestration (Skipped) ║
718729
║ ║
719-
{Colors.YELLOW}Current Setting:{Colors.CYAN} {'🧠 Comprehensive Mode' if self.enable_indexing else '⚡ Optimized Mode'}
730+
{Colors.OKCYAN}[2] Document Processing:{Colors.CYAN}
731+
{Colors.BOLD}📄 Smart Segmentation{Colors.CYAN} - Intelligent document analysis (Default) ║
732+
║ ✓ Semantic boundary detection ║
733+
║ ✓ Algorithm integrity preservation ║
734+
║ ✓ Formula chain recognition ║
735+
║ ✓ Adaptive character limits ║
736+
║ ║
737+
{Colors.BOLD}📋 Traditional Processing{Colors.CYAN} - Full document reading ║
738+
║ ✓ Complete document analysis ║
739+
║ ✗ Smart segmentation (Disabled) ║
720740
║ ║
721-
{Colors.OKGREEN}[T] Toggle Pipeline Mode {Colors.CYAN}{Colors.FAIL}[B] Back to Main Menu{Colors.CYAN}
741+
{Colors.YELLOW}Current Settings:{Colors.CYAN}
742+
║ Pipeline: {'🧠 Comprehensive Mode' if self.enable_indexing else '⚡ Optimized Mode'}
743+
║ Document: {'📄 Smart Segmentation' if segmentation_enabled else '📋 Traditional Processing'}
744+
║ Threshold: {segmentation_threshold} characters ║
745+
║ ║
746+
{Colors.OKGREEN}[T] Toggle Pipeline {Colors.BLUE}[S] Toggle Segmentation {Colors.FAIL}[B] Back{Colors.CYAN}
722747
╚═══════════════════════════════════════════════════════════════════════════════╝{Colors.ENDC}
723748
""")
724749

@@ -737,8 +762,25 @@ def show_configuration_menu(self):
737762
self.show_configuration_menu()
738763
return
739764

765+
elif choice in ["s", "segmentation"]:
766+
current_state = getattr(self, "segmentation_enabled", True)
767+
self.segmentation_enabled = not current_state
768+
seg_mode = (
769+
"📄 Smart Segmentation"
770+
if self.segmentation_enabled
771+
else "📋 Traditional Processing"
772+
)
773+
self.print_status(
774+
f"Document processing switched to: {seg_mode}", "success"
775+
)
776+
time.sleep(1)
777+
self.show_configuration_menu()
778+
return
779+
740780
elif choice in ["b", "back"]:
741781
return
742782

743783
else:
744-
self.print_status("Invalid choice. Please enter 'T' or 'B'.", "warning")
784+
self.print_status(
785+
"Invalid choice. Please enter 'T', 'S', or 'B'.", "warning"
786+
)

cli/main_cli.py

Lines changed: 42 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -94,15 +94,21 @@ def parse_arguments():
9494
formatter_class=argparse.RawDescriptionHelpFormatter,
9595
epilog=f"""
9696
{Colors.BOLD}Examples:{Colors.ENDC}
97-
{Colors.CYAN}python main_cli.py{Colors.ENDC} # Interactive mode
98-
{Colors.CYAN}python main_cli.py --file paper.pdf{Colors.ENDC} # Process file directly
99-
{Colors.CYAN}python main_cli.py --url https://...{Colors.ENDC} # Process URL directly
100-
{Colors.CYAN}python main_cli.py --chat "Build a web app..."{Colors.ENDC} # Process chat requirements
101-
{Colors.CYAN}python main_cli.py --optimized{Colors.ENDC} # Use optimized mode
97+
{Colors.CYAN}python main_cli.py{Colors.ENDC} # Interactive mode
98+
{Colors.CYAN}python main_cli.py --file paper.pdf{Colors.ENDC} # Process file directly
99+
{Colors.CYAN}python main_cli.py --url https://...{Colors.ENDC} # Process URL directly
100+
{Colors.CYAN}python main_cli.py --chat "Build a web app..."{Colors.ENDC} # Process chat requirements
101+
{Colors.CYAN}python main_cli.py --optimized{Colors.ENDC} # Use optimized mode
102+
{Colors.CYAN}python main_cli.py --disable-segmentation{Colors.ENDC} # Disable document segmentation
103+
{Colors.CYAN}python main_cli.py --segmentation-threshold 30000{Colors.ENDC} # Custom segmentation threshold
102104
103105
{Colors.BOLD}Pipeline Modes:{Colors.ENDC}
104106
{Colors.GREEN}Comprehensive{Colors.ENDC}: Full intelligence analysis with indexing
105107
{Colors.YELLOW}Optimized{Colors.ENDC}: Fast processing without indexing
108+
109+
{Colors.BOLD}Document Processing:{Colors.ENDC}
110+
{Colors.BLUE}Smart Segmentation{Colors.ENDC}: Intelligent document segmentation for large papers
111+
{Colors.MAGENTA}Supported Formats{Colors.ENDC}: PDF, DOCX, DOC, PPT, PPTX, XLS, XLSX, HTML, TXT, MD
106112
""",
107113
)
108114

@@ -128,6 +134,19 @@ def parse_arguments():
128134
help="Use optimized mode (skip indexing for faster processing)",
129135
)
130136

137+
parser.add_argument(
138+
"--disable-segmentation",
139+
action="store_true",
140+
help="Disable intelligent document segmentation (use traditional full-document processing)",
141+
)
142+
143+
parser.add_argument(
144+
"--segmentation-threshold",
145+
type=int,
146+
default=50000,
147+
help="Document size threshold (characters) to trigger segmentation (default: 50000)",
148+
)
149+
131150
parser.add_argument(
132151
"--verbose", "-v", action="store_true", help="Enable verbose output"
133152
)
@@ -206,6 +225,24 @@ async def main():
206225
f"\n{Colors.GREEN}🧠 Comprehensive mode enabled - full intelligence analysis{Colors.ENDC}"
207226
)
208227

228+
# Configure document segmentation settings
229+
if hasattr(args, "disable_segmentation") and args.disable_segmentation:
230+
print(
231+
f"\n{Colors.MAGENTA}📄 Document segmentation disabled - using traditional processing{Colors.ENDC}"
232+
)
233+
app.segmentation_config = {
234+
"enabled": False,
235+
"size_threshold_chars": args.segmentation_threshold,
236+
}
237+
else:
238+
print(
239+
f"\n{Colors.BLUE}📄 Smart document segmentation enabled (threshold: {args.segmentation_threshold} chars){Colors.ENDC}"
240+
)
241+
app.segmentation_config = {
242+
"enabled": True,
243+
"size_threshold_chars": args.segmentation_threshold,
244+
}
245+
209246
# 检查是否为直接处理模式
210247
if args.file or args.url or args.chat:
211248
if args.file:

0 commit comments

Comments
 (0)