From 44c57349f50379fe0d5ab38e0838218a735c38ed Mon Sep 17 00:00:00 2001 From: jinye_huang Date: Wed, 9 Jul 2025 14:53:24 +0800 Subject: [PATCH] =?UTF-8?q?=E6=AD=A3=E7=A1=AE=E4=BF=AE=E6=94=B9=E4=BA=86?= =?UTF-8?q?=E4=B8=8B=E9=87=87=E6=96=B9=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- utils/__pycache__/prompts.cpython-312.pyc | Bin 22893 -> 25358 bytes utils/prompts.py | 66 ++++++++++++++++------ 2 files changed, 50 insertions(+), 16 deletions(-) diff --git a/utils/__pycache__/prompts.cpython-312.pyc b/utils/__pycache__/prompts.cpython-312.pyc index 03ca2c1a882eaf0345109901c3860d85c6767bb9..8a9ffe9cba0509e63d4791ca1e9b77f3c216224a 100644 GIT binary patch delta 4581 zcma)92~bo=8h$-9ZZUIpQO9dpz0D|DkAtK=o0s?LxgJOJ05{*aJ zibuzYQ7c+WHYAdWk|%ag1 z@4vhM?*6*x??=$^OrpHLtxJYI5VAM7h<6)!g zrHp31q|%GgGjbcvc-!QR4=KSSW3=n(N;RV+bzhstMjJ^yV)1rKq@^Ee>9dLIK$Y53 z%3|pMNcFPDTS>gc zqp$7%+v(G{-`sfXRItmSjd+>OjypnUD%n*~l{DIIE%;JsE%Lxq!pdic zgzwLHgd}duyB?I>#~f=s+{icEugz}fnTEmSLFPiknTDa&eC6Wd+-iRAlFM^yF3)Zs zPFv{+Sjp>F;xEIJz9=&K9QQOqRu1)X3G?36*3~9)#?E= z>1^8L9P%XcIh8y|DL&^HlOq+Mrx20POKE~mRa<P zwNquQsBp?2=1xKc#ZK*%+x^`2rbuWT^+2^FK7ZI^9S$qlX1!sKf4}oUXJ7finqjk**G1gW z`R}rBx9*SbNj;SQL3&RtKecdZ`A{`qS#@P`4ZpNucyS|N)xIp@kWu2NdO7B$`Bp+_l-A|Az`Yhtt+N0@m`nwS)>Ib)m>RdW<6QuX{d( z71xfl;>>xuWZpZpIT_S@emODXlUboeJQo<4lcqQ~T}I^dNDml8=cIU?PxJ&iLq0cM z{JA7%ZmRh61O##_JvSY12oG0!l4x*1??*`Sclw}r9N|bQs-8w}8xJ|f5nfN%;jxHJ zoD>-%zQrl-_j z<>UQ7eEc{Cp*M%l-;Z1Xv>jML_!yt{6fuQ^2``QE4e(^YA|bTP$<4ts_Q(l8o!yML zM9tcMK#PBfQXwzsQ|>E_%a2pxICDS%VXpD(j-)?(M|^J`kG<|Hg)yp|I|W=w#<-FjHzh~tSt^S z$YV;$Lqd2=NCk?Ha|a<59o?y!6f_ZnrU!5yWOV~M4J^W5p!sNL$n0Z>~<0#udqZcVYd>=#iF+cJ{&!@6PWtIaTiCo5F6VH4n722kCyz> z`9%}#OrA`HohK-Mbi=DZts5tL_H|;zpIS^byA7nPmTTx1?u|+90)D@<=i|}SJ5ABBXa8786qlLqZ@zYEvJ?IQ}#MlE4d-3DgfOMd9r;N4L*TBP%eGV+btcfG=IlFIhfpZtnKHZipN)#5oLcJ@&qN$BGUY z@hRDZX+v>C!F+uA6+^`hL-5fNeVv23LzzPf z{FI8Tft7b@X!bxxk$ijY_UbY8Mfdsh83nvKy6}pj=<%2p!TOJtAxh)7bjD>MMO0#P@J!>Q?l)|LfF#gmn0&w1gyh9kERSb^tgUPP(2vKg5jr|{Aa#r)1z)_-RaREthx*;PJVeLzh{q@>2Xb8H(0H9+ zSbkqHjFpM{PfXKD&|Z8oqYu49WX8$=+ILH~`l?L!JfN;>ryNH@CytCNDpacj(_|EJsbhtle&OIdk z1k?jKEPJWua}Y}az5uv@cVwrcTznyW9Xi@?&ABhud0H*+izW+ibr=-=^VQ0^Q61dGNwRJSM z+1uDnxIRD3EJRef*y$+vKtE2eHv8&kn?O6QwJkRGG(ME?D;2VM9)Fb|g1*H+=NF)o z{%KZ|DCC(r`UZjlWZ{*CIp~}IlZB(m=Mqs^JlSk8!d<{R@R9jhzfG5BVo7 zbX_VA7MmemPP(a`S!QD)VYxp^pt7v^Sy>NU*y8*zMD2D*+L%jQRJBa6m5!SWq} zBG(U)8&Llq;0J)e;_u7+m$>!?)FA;D{WEjCo|v2xrm@cM`lTQ|@P)l1Wd9~K7X~9_ zAF|E<2q2_-s*q}>E7d=PdLrNKFHjYRa0?`X{Zl~t7WXckCw+xXqiq4sv|9Qj{?LV< zo$%Y>4`9b<7Zsq}SY2*FJ8^V*0(u26Dqp084J)h&dmA4w4?>UdNO_gIRE%`pUJocJ MA2dnvoQk;r0PF7^xBvhE delta 2881 zcmaKu3s6+o8Gz5Z`)1i?VRu;`!Y;5p77Z9ctAYl32^OlnG+>o1+zT$kB6lwpo84g4 z7Ht{D9Ak`+i5Zh-jW(j2L^Nt3G^v@iaS;@{AyeB*)J_|Tk{O$sW_tdM*v!=Fo#8wG z>zx1G|NiH&dp^PcJd3rTX*5BIKHn;x8?WY`(S8w(FO%1nn+Vo8@{O#Dld&P3iq+9k z!O0qxZ0Jr|t%ePwx}MW=vPe3YdO~KTMmcNPDXrD*LY=rav=Z6EeS?}8b!6UR6^FS*BS_GY~x$)g21tLT&KNta|`DpH{$GAMHapisW3KeoI zh^5p(zMq?;Y(gHjJE&Pt%9liI>ID^@B0kMZYKaPRt2WWk!l${_?EFkf8nX1m3shLr0=#4YK>l%axRcA8r~yE(d5#SiSkKd^@# zRn2Pj6&QCgt%$k+tfPFAD=9BFnhov{+P2c! z5R7^jN>MNF*(OD7RP$UCYBrEB!<4C&v}?uAm2W`_NJyI9rxo!j9-Z*$1R&IGgHy5C z=vL5uhr4yKIBPl;G5gjS)gU&~qkBO|lMKyBTksY%b2@C%zQ50xV##;dMjkwEjsFg( zcR)W_ZuP%CdGY1j`@MHxKi{FGmf2%KIx;NeP{s}X32D#FTBoM2$RE0Sy2az)=Jwt@h4<#NiYIa&a2RcY|*=pb;=1(1ggPtPpk=$l^9OQYoNobhOX} z^sLhxq|wpj;(q~VCC@e1!EL~=fcTdH4nRA2RrZ#aIy#Ya1vJi%);giB?onbMVq7F? zL7n1ND!NWEq-R0DYL03h-c>xsM3b_G$z3-s@x7XH{hTqqby#m5)hA7uVqWp=^N=oY z%r(=(@rcB+hi^Sm{q5kH!Uh<#1{s`}VQZ>P7}zo^n*W%pOs`|BJ0R`z-XckBPV z$BnUL#>8P`;;1os!jd$cRNZIm8q3HZ&d49hC>%|y9RUEYHk?^Ll3p>AQh87_k&<>=b3)VS7>F53Sw;0nX*czy^hNoz*Z9}6{xzJR z+dSGXOd<3$oXB9Fuzx(;HgG)z3Y<{KfU-#EyiADQxdNuJb z%8o8JOFs%LTwH8ru2@wiM%k68VoG$fPj#{qqwo7g-O{-`;NwEai5<-E0ckZLnZl;x z;b!9x1CEfsrF-$1FL#*_<1i9xk0RP!gLNDDngIep?9xLZ$pJFJaZ-_+iATvVbN42G zW6hDJdL>hc4*dneh<(OxB$3FR+kvnkE8fOvbiJ=>xi%9H|hH!ARDj{@CM*b3Kv}YxqKApZ_VQ0 zi0oRKhu`vjw(>rfzYXSh$WZBXx(`4s2V4YPBAI1b*h)5*?ZBse|0sJP(VYbQ7Qk-- zXUV9$6fU^KG_q$XINS$1# zz@G;r+$8}y*XeKxE`A45RK+KYD{5xG-zkWJJOQms*xJI0=iF&;<#-=iU1gMuRlGnp zRhjWs^6RQnobDU1vM}Z+)@TS816YV@Z2=zg{dDal4*QrIY{7g!IN{eB|9etdllQ;n zdb1`IuOWY`IY7&`xi(gk4CM;QnmgDgPApi_T`5f83J~AT0@5EyZ&W1te0?_loCIxH zmFT1&jrW6RnBpgVJc!o-BY;uBb@IZ7s0}lH0Xh`GW_n_cyQnK5WgQIyPk&Y*emg+y z6|wp^KovJ5R$sii$G|3*dVyGK^-QU6g86a1`AJa4Ed)RkuiH$Ju9C9468UrVHCz?s z%PO1iV9a@RUh=6u4SUE#`&zZwwc=-)Ahq=|_-oQxzb>Sm T!G?p{X$0eRqm2Bue%^lo&zwK1 diff --git a/utils/prompts.py b/utils/prompts.py index 8dab0a1..69dc7ba 100644 --- a/utils/prompts.py +++ b/utils/prompts.py @@ -140,8 +140,30 @@ class BasePromptBuilder(PromptTemplate): data = json.load(f) if "examples" in data and isinstance(data["examples"], list): - formatted_examples = [f"- {item.get('content', '')}" for item in data["examples"]] - return f"参考标题列表:\n" + "\n".join(formatted_examples) + examples = data["examples"] + return f"参考标题列表:\n" + "\n".join([f"- {item.get('content', '')}" for item in examples]) + else: + return json.dumps(data, ensure_ascii=False, indent=2) + except Exception as e: + logger.error(f"解析或格式化JSON文件 '{path}' 失败: {e}") + return f"加载文件 '{path.name}' 失败。" + else: + return path.read_text('utf-8') + + def _load_and_format_content_with_sampling(self, path: Path, sampling_rate: float) -> str: + """根据文件类型加载和格式化内容,并应用采样率""" + if path.suffix == '.json': + try: + with path.open('r', encoding='utf-8') as f: + data = json.load(f) + + if "examples" in data and isinstance(data["examples"], list): + examples = data["examples"] + # 应用采样率 + sample_size = max(1, int(len(examples) * sampling_rate)) + sampled_examples = random.sample(examples, sample_size) + logger.info(f"文件 '{path.name}' 中的examples采样: {sample_size}/{len(examples)} (采样率: {sampling_rate:.2f})") + return f"参考标题列表:\n" + "\n".join([f"- {item.get('content', '')}" for item in sampled_examples]) else: return json.dumps(data, ensure_ascii=False, indent=2) except Exception as e: @@ -187,25 +209,37 @@ class BasePromptBuilder(PromptTemplate): full_path = self._get_full_path(path_str) - files_to_read = [] + # 简化逻辑:对于单个文件,直接应用采样率决定是否加载 if full_path.is_file(): - if random.random() < sampling_rate: - files_to_read.append(full_path) - logger.info(f"文件 '{path_str}' 采样成功 (采样率: {sampling_rate})") + # 对于JSON文件,对内容进行采样 + if full_path.suffix == '.json': + file_content = self._load_and_format_content_with_sampling(full_path, sampling_rate) + content_parts.append(f"--- {full_path.name} ---\n{file_content}") + logger.info(f"加载JSON文件 '{path_str}' 并应用内部采样") + # 对于其他文件,根据采样率决定是否完全加载 + elif random.random() < sampling_rate: + file_content = self._load_and_format_content(full_path) + content_parts.append(f"--- {full_path.name} ---\n{file_content}") + logger.info(f"文件 '{path_str}' 采样成功 (采样率: {sampling_rate:.2f})") else: - logger.info(f"文件 '{path_str}' 采样失败 (采样率: {sampling_rate})") + logger.info(f"文件 '{path_str}' 采样失败 (采样率: {sampling_rate:.2f})") + # 对于目录,直接选择指定比例的文件 elif full_path.is_dir(): all_files = sorted(p for p in full_path.iterdir() if p.is_file()) - if sampling_rate < 1.0: - num_to_sample = max(1, int(len(all_files) * sampling_rate)) - files_to_read = random.sample(all_files, num_to_sample) - logger.info(f"对目录 '{path_str}' 进行采样 (采样率: {sampling_rate}),选取 {len(files_to_read)}/{len(all_files)} 个文件。") + if all_files: + if sampling_rate < 1.0: + sample_size = max(1, int(len(all_files) * sampling_rate)) + files_to_read = random.sample(all_files, sample_size) + logger.info(f"目录 '{path_str}' 采样: {sample_size}/{len(all_files)} 个文件 (采样率: {sampling_rate:.2f})") + else: + files_to_read = all_files + logger.info(f"目录 '{path_str}' 全部加载: {len(all_files)} 个文件") + + for f_path in files_to_read: + file_content = self._load_and_format_content(f_path) + content_parts.append(f"--- {f_path.name} ---\n{file_content}") else: - files_to_read = all_files - - for f_path in files_to_read: - file_content = self._load_and_format_content(f_path) - content_parts.append(f"--- {f_path.name} ---\n{file_content}") + logger.warning(f"目录 '{path_str}' 中没有文件") except Exception as e: logger.error(f"加载Refer资源 '{ref_item}' 失败: {e}", exc_info=True)