From b7c5f92e899ad3a77357f6b30dd799b7b1270abd Mon Sep 17 00:00:00 2001 From: jinye_huang Date: Sun, 18 May 2025 23:58:56 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E4=BA=86=E5=90=9E=E6=8D=A2?= =?UTF-8?q?=E8=A1=8C=E7=9A=84=E9=97=AE=E9=A2=98?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../content_judger.cpython-312.pyc | Bin 41504 -> 41684 bytes utils/content_judger.py | 68 ++++++++++++------ 2 files changed, 45 insertions(+), 23 deletions(-) diff --git a/utils/__pycache__/content_judger.cpython-312.pyc b/utils/__pycache__/content_judger.cpython-312.pyc index 6776532b903477593c30803cd86e2e42b0aa53c0..3b99321455cbbcb53b41e1457957b5e276888c98 100644 GIT binary patch delta 2428 zcmbtUdr(x@89(RVy}NrayUSxA?7}X)n`O~m#O0M>-LzX?@=%f>MoVi^ z%VOGCSoFdeN_ZQ$Sfm@8_8{TJE2bae-H*Yt6sWabX&q@n!;9m4CQ|1TO3uiZ73k{| z4OBaGuG4~Q3b?VpmNp1KLzO2J(Z-WH_)Dy1zyNAF*#UQ;_|{B0A0u){fRKRlTc6{+ z9It%3O+?39YgTd}wyx&poFciCr!o~dS4nb37Tuw2z ze2Rf_kEosXLLm;qitq?3k+SVIIEotEe$VC7WVVQ;7>;de55Vk*6jleA}h&>zlh zeIW;*Me*6g6Huv$rjr1GL?~U$IFZw5~*c319~P!U+$)7+{|V$_WD~7X~>Q3 zc;|=&UPGUC{0SzFeR8e{QhV@8q{C&juG0Y*QGb^iHFd_yd0J#1lYaGWosP%XVu8%()Z=HfoUmywYlDj3tbdPn3l!qWs_T# zh=}j`ivBkVh^A;nif8$RQS}HuRPuRaFqW^+o3t@elo$q81#BP+=evB&Glx<1O8ThP zq-x;%^D(*_9ECQ}sC!ZzfK(uCN2f1XTa+T*3wV{tVj0`Vtmg5Qml5TdN*e~li5#J_ zLC?p1qbC=QuA`1o2a_;`QcNYj)D6&QtfwIGnMFqAjw^8m#%GrRyjz*Q8u0O6_5_>i zhrSPqNgQ3MUs=mvr#Xf(8xpy7uhITeDX!6j*1dcg^56#ZcnB0Kjt&= zCUPhKw;F`ZRR$d?O@Bv2-MiWFIp33K62TsnC%lj02m%p5CxW~y!Wa=BVR4-1l9z(zDd6{`&!6uZ{>tY1_vjAkc=t5<^8ECTCs$Pc;;ZM$n&vb=- z7s7A0hlk!oUmiBqHCc&c==6Pmd$_$V{Ig%-w>x~c<9_GBz0+sILl+)&_Iz>qvLjyD zj^l(KM7&A_dBcSfB7TC!qO|t#;l1^|P)#gVL~J8w$$>-r>rvZ9GgP7XFMdB`5!MO4 z#Gz=YeQocqxY?_wBE*eIsgVP~jkSwMvZ)eR-1zfAQcftIb~)vN)H3N}JOyxsz-TY?w_>4;k$X#&xsCb#cSn zJ~w9HHdz9i-s)lXySvcGeL2M@92~KLHI7UB2KNo`pG(`|e>o!Mw7IwS#sRK}3wS29 zpX*CPjt$*x!1k^xq5)~?{<7}6b1y}H0P#qX-B63R_S?8rpfLp%#xEngP)q-FtOgkE zOF*X0MI-%|7W*0u-kP}tC_`p*;Ct+HwayV}n?qaWLiUQBj~3PNJKEG`Jf*47`+3 p0vY|%R!YTud&y#8^8Wfl4XPL{gEOdW@OW$i2USaeNFfD}_D==~$$bC- delta 2230 zcma)7dr(tX8b9YglH8Dl5CY*5LQ)_a5D4IlE|LZi2#im(T`jC_xz+_8ac_7PE{Z5F zY}qYvED6RyaJ1t1C@niJYj>vDf*a~~3ynK43^VH=jmUJnjp**S`p=$=MmszE$DWz{ zJKs6q<9xq!zI)E?Ny0NhC~nE+Qh;0Tzf8Y0yFXB*Hxro`7N{h--{9$a(q8ul_d6`1jHBCgy0Op-9*RJCgu~cloQ}>Jdt*G& zrz+|Y2k>N!BfTfxlQ>NKRB5xS^)ssVBei3@eX9Hp@k0?BIBJJ6Xvb*-`bC~}bknhR z2q)0}Rt=hOjf7XRlnVP${P7;R1$}VbO!3lcQefEI7J~kE!hklm?S$u6?9a7D!narK zYtdp`4@^OCoiLBWlN3~Q7jT9vgt&`>^q!D;MEjfY8sgi(V!kDW`~y_Z=DmU!1q?|j*3e8pLWR&~sx%+8(bGI5Z$Jf$rjd={R!4tJAG zel+g93E3E@;-G->Oo^17S{+qFlV8wuiFEXO=RZgYM=$-ho`N@!zV}-gHY!5}P&PmT z-~bG$!pDJGM55Ns5~=`He2I>pHo;Led^#=d0B8~MG@dNQhbFEK!5OSHUVIroky!Fz z0VbHtfLb{^zE7*>#VmD&zKj_P4FXT%b?ouPDql zm?Obbtny0OvwD%K$R`(HyDdlM1ASAuq)VY8b;UoCGmn^VghGYZQaHxyQI9L{d(7=}Ww2xAjbUN=vg z*EVvv5W(|54}EuL+p)^0Wr{Tb;DQHP_GC_Ro z4u4nQUB^fMUteE3-T$H<)w$9)nZmf2u`BnA0J{Yc?qlw%00MKaOn_Y&=0&d`IkNvS zJ5N>DR2_bDS+}2!hau>~*}YQEg-z_j1%lGuMM-M}*1Iv_YfxPT8O5Ebm=4<=UGl8X zJfk!FbeYQ_3Ax;{#uvx(#%Z6Su)8E60fvNb+nhdm8H7Y+IjMzMowL@J?XjP+Ak#or zl?KlSV!$fX#REeJhO2zWjosVudk_`ttomBBYIyU=zL_-zbIIAW$<~=<>v)n^?MvS3 z3=N1x5fyMDN;@d+mwGJY5r2&;nlo*5GM 0: content = text[quote_pos+1:content_end].replace('\\"', '"') - # 特殊处理换行符 - content = content.replace('\\n', '\n').replace('\\r', '\r') + # 处理反斜杠转义的换行符,如果字符串中有'\n',将其转换为实际换行符 + # 但如果已经是实际的换行符,则保留 + if '\\n' in content: + content = content.replace('\\n', '\n') + if '\\r' in content: + content = content.replace('\\r', '\r') result['content'] = content.strip() # 查找analysis字段 @@ -503,6 +507,11 @@ class ContentJudger: if analysis_end > 0: analysis = text[quote_pos+1:analysis_end].replace('\\"', '"') + # 处理反斜杠转义的换行符 + if '\\n' in analysis: + analysis = analysis.replace('\\n', '\n') + if '\\r' in analysis: + analysis = analysis.replace('\\r', '\r') result['analysis'] = analysis.strip() return result if 'title' in result and 'content' in result else None @@ -555,7 +564,7 @@ class ContentJudger: def _prepare_content_for_serialization(self, content_dict): """ - 对内容进行处理,确保可以安全序列化为JSON,同时保留emoji字符 + 对内容进行处理,确保可以安全序列化为JSON,同时保留emoji字符和换行符 Args: content_dict: 内容字典 @@ -570,14 +579,22 @@ class ContentJudger: for key, value in content_dict.items(): # 处理字符串类型的值 if isinstance(value, str): - # 第一步:彻底清理所有控制字符 - safe_value = re.sub(r'[\x00-\x1F\x7F]', '', value) + # 第一步:清理控制字符,但保留换行符、回车和制表符 + safe_value = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', value) - # 第二步:将emoji字符转换为相应的Unicode转义序列 - # 这样能确保JSON序列化安全,同时保留emoji语义 + # 确保文本中的反斜杠换行符(如\\n)被转换为实际换行符 + if '\\n' in safe_value: + safe_value = safe_value.replace('\\n', '\n') + if '\\r' in safe_value: + safe_value = safe_value.replace('\\r', '\r') + + # 第二步:将emoji字符和其他非ASCII字符转换为相应的Unicode转义序列 char_list = [] for char in safe_value: - if ord(char) > 127: # 非ASCII字符 + # 保留常见的控制字符(换行符、回车、制表符) + if char in '\n\r\t': + char_list.append(char) + elif ord(char) > 127: # 非ASCII字符 # 尝试保留高位字符(包括emoji) try: # 验证这个字符是否可以安全序列化 @@ -591,18 +608,18 @@ class ContentJudger: processed_value = ''.join(char_list) - # 对于内容字段,特别注意保存换行符 - if key == "content" and '\\n' in processed_value: - processed_value = processed_value.replace('\\n', '\n') - # 最终验证这个值是否可以安全序列化 try: json.dumps(processed_value, ensure_ascii=False) safe_dict[key] = processed_value except Exception as e: logging.warning(f"处理后的'{key}'值仍无法序列化: {e},将进行更严格处理") - # 更严格的处理:只保留ASCII字符 - safe_dict[key] = ''.join(c for c in processed_value if ord(c) < 128) + # 更严格的处理:保留ASCII字符和基本控制字符 + safe_value = '' + for c in processed_value: + if c in '\n\r\t' or (32 <= ord(c) < 127): + safe_value += c + safe_dict[key] = safe_value else: safe_dict[key] = value @@ -615,10 +632,16 @@ class ContentJudger: json.loads(json_str) except Exception as e: logging.error(f"最终字典序列化验证失败: {e}") - # 如果依然失败,返回一个绝对安全的结果 + # 如果依然失败,返回一个绝对安全的结果,但保留换行符 + safe_content = '' + original_content = content_dict.get("content", "内容包含无法安全序列化的字符") + for c in original_content: + if c in '\n\r\t' or (32 <= ord(c) < 127): + safe_content += c + return { "title": re.sub(r'[^\x20-\x7E]', '', content_dict.get("title", "序列化处理失败")), - "content": re.sub(r'[^\x20-\x7E]', '', "内容包含无法安全序列化的字符,已移除所有非ASCII字符"), + "content": safe_content, "judge_success": content_dict.get("judge_success", False), "error": True, "raw_result": str(e) @@ -626,11 +649,10 @@ class ContentJudger: return safe_dict except Exception as e: - logging.error(f"处理内容以确保安全序列化时出错: {e}") - # 如果处理失败,返回一个基本的安全字典 + logging.error(f"序列化准备过程中发生意外错误: {e}") return { "title": "序列化处理失败", - "content": "内容包含无法安全序列化的字符", + "content": "处理内容时发生意外错误", "judge_success": False, "error": True, "raw_result": str(e)