@@ -187,3 +187,44 @@ def generate_image_url(image_path):
187187 else :
188188 base_url = getattr (settings , "API_URL" , "http://localhost:7091" )
189189 return f"{ base_url } /api/images/{ image_path } "
190+
191+
def clean_text_for_tts(text: str) -> str:
    """Strip markdown/HTML markup from *text* so it can be read aloud.

    The substitutions are order-sensitive; each stage assumes the previous
    ones have already run:

    1. Fenced code blocks are replaced by a short spoken placeholder.
    2. Images are dropped and links are reduced to their label.
    3. Inline markdown (backticks, braces, brackets, emphasis, headers,
       blockquotes, list markers, horizontal rules) is unwrapped or removed.
    4. HTML comments are removed, arrow-like sequences become a comma
       pause, and remaining HTML tags are stripped.
    5. Every character outside printable ASCII (plus newline, carriage
       return and tab) is removed, ``::`` becomes a space, and runs of
       whitespace collapse to a single space.

    Args:
        text: Raw text, typically markdown.

    Returns:
        The cleaned single-line string with leading and trailing
        whitespace removed.
    """
    # Fenced code blocks; mermaid must be tested first because the generic
    # pattern would otherwise consume it.
    text = re.sub(r'```mermaid[\s\S]*?```', ' flowchart, ', text)
    text = re.sub(r'```[\s\S]*?```', ' code block, ', text)

    # Images must be removed BEFORE links: the link pattern also matches the
    # "[alt](url)" tail of "![alt](url)" and would leave "!alt" behind.
    text = re.sub(r'!\[([^\]]*)\]\([^\)]+\)', '', text)
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # Inline wrappers: keep the content, drop the delimiters.
    text = re.sub(r'`([^`]+)`', r'\1', text)
    text = re.sub(r'\{([^}]*)\}', r' \1 ', text)
    text = re.sub(r'[{}]', ' ', text)  # unmatched braces
    text = re.sub(r'\[([^\]]+)\]', r' \1 ', text)
    text = re.sub(r'[\[\]]', ' ', text)  # unmatched brackets

    # Horizontal rules (---, ***, ___) go before emphasis: the emphasis
    # patterns would otherwise eat two of the three characters and leave a
    # stray "_" that no later rule removes.
    text = re.sub(r'^[\*\-_]{3,}\s*$', '', text, flags=re.MULTILINE)

    # Emphasis. NOTE: the single-character rule also matches underscores
    # inside identifiers (e.g. "my_var_name" loses its underscores).
    text = re.sub(r'(\*\*|__)(.*?)\1', r'\2', text)
    text = re.sub(r'(\*|_)(.*?)\1', r'\2', text)

    # Line-leading block markers.
    text = re.sub(r'^#{1,6}\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^>\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[\s]*[-\*\+]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^[\s]*\d+\.\s+', '', text, flags=re.MULTILINE)

    # HTML comments are removed whole first, so that replacing "-->" below
    # cannot break their terminator and leak the comment text.
    text = re.sub(r'<!--[\s\S]*?-->', '', text)

    # Arrow sequences are replaced BEFORE tag stripping; otherwise
    # "<-- ... -->" or "<= ... =>" is mistaken for one tag and the text in
    # between is deleted.
    text = re.sub(r'-->', ', ', text)
    text = re.sub(r'<--', ', ', text)
    text = re.sub(r'=>', ', ', text)

    # Remaining HTML tags.
    text = re.sub(r'<[^>]*>', '', text)

    # Remove everything outside printable ASCII (emojis, special Unicode).
    text = re.sub(r'[^\x20-\x7E\n\r\t]', '', text)

    text = re.sub(r'::', ' ', text)

    # Normalize whitespace.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
0 commit comments