From a0f66a4a496b9ae48c4abe97b5344f5efab275aa Mon Sep 17 00:00:00 2001 From: jinye_huang Date: Mon, 14 Jul 2025 15:57:09 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=9D=E5=A7=8B=E5=8C=96=E4=BA=86=E6=96=87?= =?UTF-8?q?=E6=A1=A3=E5=A4=84=E7=90=86=E6=A8=A1=E5=9D=97?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- document/__init__.py | 20 + document/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 585 bytes .../content_integrator.cpython-312.pyc | Bin 0 -> 6153 bytes .../content_transformer.cpython-312.pyc | Bin 0 -> 8852 bytes .../text_extractor.cpython-312.pyc | Bin 0 -> 16783 bytes .../__pycache__/web_search.cpython-312.pyc | Bin 0 -> 15500 bytes document/content_integrator.py | 130 +++++++ document/content_transformer.py | 236 ++++++++++++ document/text_extractor.py | 356 ++++++++++++++++++ 9 files changed, 742 insertions(+) create mode 100644 document/__init__.py create mode 100644 document/__pycache__/__init__.cpython-312.pyc create mode 100644 document/__pycache__/content_integrator.cpython-312.pyc create mode 100644 document/__pycache__/content_transformer.cpython-312.pyc create mode 100644 document/__pycache__/text_extractor.cpython-312.pyc create mode 100644 document/__pycache__/web_search.cpython-312.pyc create mode 100644 document/content_integrator.py create mode 100644 document/content_transformer.py create mode 100644 document/text_extractor.py diff --git a/document/__init__.py b/document/__init__.py new file mode 100644 index 0000000..47dbe92 --- /dev/null +++ b/document/__init__.py @@ -0,0 +1,20 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +文档处理模块 +提供文档文本提取、内容整合、网络搜索和内容转换功能 +""" + +from .text_extractor import TextExtractor, ExtractedDocument +from .content_integrator import ContentIntegrator, IntegratedContent +from .content_transformer import ContentTransformer, TransformedContent + +__all__ = [ + 'TextExtractor', + 'ExtractedDocument', + 'ContentIntegrator', + 'IntegratedContent', + 
'ContentTransformer', + 'TransformedContent' +] \ No newline at end of file diff --git a/document/__pycache__/__init__.cpython-312.pyc b/document/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c005cd53bbb98cb08b96e947aa048c9a8eed5bfd GIT binary patch literal 585 zcmYLFO)mpc6n$^Ljg+*u{y;b84b-kXX5|pEsHF?wNb;z308>Sj{Kd=$~Sr+qsZh_fl*K1v4oVntz z78q%~ghgQ00w-i>aWyc8sqf@Hca+7UUp(g0Ai$@LG5#`Sto)h?kG!CX7c}*)=)Kaq I&iX;{4}oCIFaQ7m literal 0 HcmV?d00001 diff --git a/document/__pycache__/content_integrator.cpython-312.pyc b/document/__pycache__/content_integrator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ed99577b629c13eddd8ff1578725d95b8635ba1f GIT binary patch literal 6153 zcmb7IeQ*=U72ng{Nhevhgujt(EPwE!7#lax5DYj0+yY6)3^AEZ6o@Kxha=F3b9YK& zSI&?QB%+XFI!P=-(?FX^l#&M1ri>|>hV(CH`j3R1T0O{=w8+3;abjjN?O%QGPAAK@ zGBi7LZ+G9m-MxLg@AuyB-%3j@1k#@_$j`pIh>(9{MNe$5F!LT1MhQ=NDnQ(r8v<0& z;5GzlHyvc$OptZ66!PhSF=%p|)G`y`f}ER!GRqqS=AgxGQTbG$#9d+_0@+`yPc)kg zrDrLv<}MRV2C|j#rbC41&KflC*_^1`rnZ@(&7!xt?Squ7bsBk+5WPdp14Y(-9%zI_8FW{9VsBlmbWPeb&gL4C8(T`q~MXyg5_@1yY5)?wR zFE_86(YeZu424lxiJRhy+wcVF$4v{2z%B;9VB`%OFgHPo&X*Xq#0h4>w9k-}V^y9R zXe|QVtSRZ+x5`iDgHAaqF!^ z*;fu{Ke`A@&88;v>1SowP<-C~=7oKt2AfRoEgdv0U zAnjsQXTO+TCxDXNN>2Z@XBFvmA^!iI9bf$XoXz)aEvf7oZcBQM^fYY{WvaLJS);-y_In}oL^6D#8y6urGJ5z1F zBg~||C1r0pz2&NX#n&vUT{g`Ud&>;W0ct5xIFBdf4GJEQV)1x_VLk$?@hDc0=edYC zkZUn{Jg`iUM?{5)2w95FX2AaB3ZdKWm}eJmSrky(o9Xp(U4{uGL^OQinuRcd!jeZG{cJ1%;HQ{ZJMdg z*egf+PMD!S!x-5elr{$r?}IO&^6(h~)+|gQNbm+QWQL~&RxqN0^9;ZN!?TANH#diC z2CfmfmIYiBa7%aq66ninFafe=;950+$ej{>yTf9n9cI!%AqNvD6V7&LuKey>H^+ZB z_2#QnXMPLVar4tNQ!k&JW6QWM)s|7kqht_(vR7ORWfvtPl1pg#PF2;orMctgSnwNg z+_W|)c0wf`BpJ@CDN(f!qbSG`G2|;m0kk1?_QzowKoUwSm-9%|56H_?e(*~?-A_Jb z&gRgdkN62#KLyya2C##U(#4QvzrIm0g`V=*!9)$kuqDbA-~_AUgdwV8`>_WY#EfFt zLg~J3n3YTPnWAi-YlMENfO#o8+s_0Nwt3A^Ef|BF)k>5qm?b|()EG5H*{CVX73#@l zjGTjk7M=wQppTmJg-v3=)RnJcn+`xymvid)NcQ-faDcRvl#PwvK9iUuoUc5tZt}dL 
z^wXO6YB1yimzEzwg;#|{@c2#r;mp*pj==mk|8gofx$6I1CKYnUGLR8bcNz>stV2Zf zm|_JI+%6ZVK?EY8SdF9xh++q~r~1FV`=h~8DvLb*LP$Ud?9shog$E!T;|9;HJ9L>< z6(=>>$H*7HBLG@Qx0r(tdH4%Q|*|nYEMsEACGo*i-I=ELOr0y!(4MJp%hWW%f7&hZpJt1S8*`w7gJr5vaUTH4 zDCq|XC^U*$!v$FQhLGJLHB1kqp<}Oki8n;)QxMojnPOoFv@mby4VWU|F$N30jq?VE z>M8QJ;cO1XzO%0p`r?Y_C79XtHhs_D9Q0NL{JGu#ubIvF9Nls+378$-;=FXPUi`gh zF8MxV9J|X9-dYqm^ZM^2u2La74r`F;eGITkI7$cS2zqeF_IDy+j7yTxcAD6DHlO1 z%>J-H1Q9C&s}`|x`XxZ2{31e_>bX^b(xNL2E{2*Hg3)~R+=+ev1q9;DimDiQv%2xL zIq`$PSGOPeacome%2+Fp?>Mp}ejw45s7P#1Navy-L=&O$o@D)|3F~IPIh5!-x95XB zi5UJxIQ^)n+P|->7xON;6#5 zBv+r}>f^&7_Kht*`%H4>hVe&}4cjKT?ek6Y`ugs1>C>NH{AqgqqgQ-a*YAw`-Z^mc zK%zCVI=N)k*wU*@)+V2HPd>Rf_2k~zj*NBjq}7?SI!|vpz54XtM8&!K59$+TV|&Ld zK5e+zFmAgnU3n<^$kWND-Je_cK(GLo!|9gOkhU#Pa?7>L04i^eXGa~V=2NSikD;IR zky-CvEWGF8$v`)XVFO$U=%@jV!v$WSDKH&IKVZD!z;4hOb*vn&3q_t4AoQp8@puMJ zAqvp}KF=I$V~Cs&{GoR~$cM&U!i~TLcRj)zKQx`q890~?%X7Th0S_I=nd3$uTx5KP zJ#Z-1_=FiYMvXj!#tDCBjWQ<;$7~Go(Zj}JrVk{kT=@m!vP3#z0xc5H9{%H<*FU;B ze(BC{;$M9dyY}zk}+mOn?eeN|)?5j_X4B4OF9a$rEcDC=1tlPJ5-=4NRc>kJx z!I*Oy#3l@!XVAUl!wyg3u>~(`=vD0`cQaI|7w6myHw3)Fy}Y-3Xl;>`yD)n0*boT& zyaA~j3R=gPVHOD?tpeO6%mGQ{<6PI^hdg_-crwnU= ztrBkTXPu>+jmFQq8KAwcQgI3J#6~1eBuz-bmdNbOjCPBL*eOp}#g zkftw4!xyCe8ZqB6J7P7HW=G2Gh(DM%w;nX!pe)}S?9>hl)B|*eCWew}mGNz>KvCPM KZ-|5*3Kk#-tRY1fkNzt*g?-IG22qu+P$ zj9!*Q@^|M*ckbNpzVG+@UiaUMig*T|(U0X<^wkXWJN(cdgPNK7cgPGg0%Kt|by^LjajGI01h&G}0+g*N*rC;_1ZnKVgmJte$GD6`s z9WA}4R<$oAv?8DtYczXBkIq&S^gtCpaqP{+jZY>&y`DUAC2@WvIdVDi-Y`G@>1PvX zMz(IoM8PO4%OK>6bbKqAxYoU>OkAeLnRz>=xZfr8FRVx;(H64i{aMQYxn# zJ7o7ESt-DtoU%ji>2@m>DY6rH&*5{+PTV%7I7M&^4yW8<;}x^R;dlDn4u`@!9KL`M z^kTly;rMaT=~XEvheHUs91an2pp-ZqPQO1OlfEUWte0d_#N84(Ow5pc#LPg-{JFJ7 z3N)Ct^CgWYUlER0Yt7~B@Y=mvno`5TcgnB9D*S+GEz0_cGUdM3@- zX#~(2?1gjC3V~KM2dxNb#dFY#fmSjHtpsSLFt2%ZhOmOtqny1=fN#i`L4H1fdx27> zo(CL`%7|j{Nmx=Qd;#y)CM!TZ`OdErrv_6zNx2lkJv5NGae*SZLovcp0=!(Rgn`Z& z#t468j+t|i3^Tp#Ay(3{jECuE98f}^Ugkh%n{+91RL%o+vCo_bN|Et$9w;TUJG)q5 z-!;JKsa|>*o36uVQ06;j;6WF`j3hh#f>RWfqHYmT;gTJa)9aR$LZ|_}?C1)51h-;& 
z*&FC`bO$6^F-bw6&nfoUOrjNrQ|iEsO;W%71n?ag2;u5GTZHA6~%0Z7Jb)b6LATfDL+R=Flpxh7sw6{}bo zsaW}-z_^4TEPg@DEE1d6^mu8M;?Lg)m>!X(`jn*2rH-hJYdplrQd=syFLpnoNts~BKP|^ z9n;J8vb}musrDIQUTj`AW%IGAT%?WNYtZHh?TfY2^t)Ft=!0NZyld<;%JZ~x7#l}4 z%Ja371LayS{npHY>xOHpF|(Vo8G}4*q^gUE{^zaglgexmyQJ3C7gA>*V5^l1jzoq& z%W9ILwpy)pJSt2ketA3b{##jfv)BM?5Bvf|&-f?DlP6D2d~j>>=83G@sgGTRU2>+n zCnfYWLqQup?^J_}VZ+A9P9)FVOb)$)h77S;=p`_R?;t5{i8wlgeixk}&P_t# zsjNl?(!A6jXghFtX&%Wms}`?)IRJM#3L@L_{i+@{X3pcip z`Jz=@!{$ZPW~O?1eA(LAvS%X8o*|j7k!4%IXlx2Mua7oYmYB!8I{9J z+8z{{8u+le?#nW!bV3j7DRAV2CQV&RrJZe?Y9njpCR80mHb3U~y=ml;ub4~60+ej~v zC#p0h@}vreN?Q%*Z%!QfggOQkAwe(+7{0*@4gyfdKf9cK<0goXiBrdTzM;XIxO6M| z?sZzsH(9Blk$mrOlDAH^T3^rR)4uVsU(gCrbA0Uf?Q+PTyPHcmh zxE>OvG7)M;wLXMl2vGG|O&Zw->x636Ba< zM<6FIhp)q{H_@f0MWxuBZC>QDX`Y9c5+(pTuB4nBG*2&KN*A5{$?>1WtCqy7Rz|8; zf(0<#bGj!~5vyMnsb4j^adc<6er>ekiFnP@Sk0P9&6;?1L#%pLqs6%^KIbLYq)Mq1I7h#z;O>UHd8PQI@EEVI@6yxb3=uW0!s(zWKyVOO<0^3PaOYb z^7!BJluhRZ3~+8H^nQQm0-ulL0gx0pEX?3alG2BTEwc1&9R=4J+XLaPvdakY&CHppA4062w6%!NF2y;o{+ar}vFL z8*A@~w0C?ihIj0YwC{@6?S^!8O=xK-80xxG9B$tgsoNc0uxF6JUs92c91Ry~q*T=B zVC1Vm7#Qmwb_RvWrNRx%%vpFrZ;0UK1IT=P)d3uhr;zi^p*nO2GqxZzjja(Cs`itn&Z=K=-IRGAM-elUI*Sem45`3``WwTns!lZ~|mLH7`+@P_6V0 zv`&I>etxY(SRa%bUBW~3#=xz~%R`CdCqP{!Lt}{>?}3wAKbvJAst0m7npi?vTx^2z z{$EM`#EA=u8&|+en79^Fxjq4V09>C*p{5dW%Sb zl#>1_vxZ3gIi!E8Qhw<#?to5}0@aD~LWFsi@NdLn8y|{>R0!rO)RzuIf`Y${2>ule z7(AZSp0TyDwr!ENZJ)b77b9)EqSoDzjxGy59&(3%c%?erwku-Y9bLF*uwa(lpJf=u zE0*M-1m%AH9#-51V`qT{H;#biH86|5y@{01+|gnNXgHlTxb(zyf@sB$z}x8Kz(@k~ zNznHiAm!fUz>F#ZGfLlQ$S|agf?*Cb4b^YX1LK%73^bEq>@{T>Xr`2bW(EW88>^jO zBdDXooWbA|O7lfo%utT*bk1|#bWNS}ZZO)wa3d={Nb|Zr6Bu#k-~wor_J4>op#u4j zKtc7Dad)CR8|eAa;)&arCU0C$dp_@sP7R!#?fIlCXn~2erH2;-QXf>Hn4Ws)v&5~d z(1T0$XJ}BNFY&j-6W6a&K|-N9r{9BaS%#j=!8lTH3}N(#hL7?%S44X$?1~_o6aaf0 zLAtmdz5uRhy=8y3ImOzIkbPBQ zUxS+_xbq;x5-}mZ08J%iEwl{DL!BkOnK-=Sp%OxM82<%(`4o~&!*D_6@RrkC#+Jug zcSKru#9Cj7w7zh+b?@EUeUa9^zg#+6Ke{tiADSQ95ORigk8l_MGTgfN*R}hi<@P~K z9IQ$dEoW^dn 
zJ2|{X%aX{c4MJHxceJ{fMQ)W-Tb6I5AcoCm#;{oi?soJ<+9W@|o%rieI!a0+mca_> zGA{*VlzOsvBHjXtt&l(g!G{YAo`OWea|Pa4BXdpxVkI;Y2x%S-a5%Z&8Xkoafkyl6dhs)c3V{RwAX=6KVIuRRn*vdterXt=8intze z5heyqaDE~NYw2BNGv-hM(x~JH%q_T@e+$h!_@8h>I^{;>HBsZZb55c^!UY zf`7>hEOnImEw||bXEyNX4vlalyGI*F_I^|xsare4K-u&nrs=!!C)yvFmK*qZ)B0&W zyzf6^n>N53B8#?_qw_y1nno&oZDhe^!~BC?>>>jne)9X%48C7iBS_BiR6uUqY<2CB3? zHO=6A=f1Fge`M!N^y`3c21_5TuLTm!oq_k~hr)Yb{MSB`55My243JLh{I_ z%RMs;WT(w~!@7~(Uo!Bbn}^$FTSSgXF=2e!f?0WwS$dCYy2mWPZ&^H85wk3gSQejqB5GOM lZ@ka)f6^J*8i)Y1H8XlfS1^sAe=6OkXV?9K!IbRk{{fP^Q78Za literal 0 HcmV?d00001 diff --git a/document/__pycache__/text_extractor.cpython-312.pyc b/document/__pycache__/text_extractor.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6101c644baa8c7d8e61af246f09784dcfdfc3038 GIT binary patch literal 16783 zcmdsedvp^=ns2w%Ewx^jEz7q2mY*Q-1Di*{n1{hUV@xJ68SrGHHEscNWXszv2iqeD z@|Yd7O9nGTViU-*nam7gAi+6zcgK0{7&3P*dv?xQbp#_#hwQRYCJTi~kds;|DTs;=*=-}hDNpG+n_0io`a?_hWtLHrRPlt9NacN<88 z7$I1qjbKSu24x**)BkjF|_5n3@Ty^!5UsBSmOniTzWxHh_*bb&IEO7N?luizp9LIrI!;N z4R2xE%zKGrj~^oX)O(3T4j zTXJie)AV2cVEXNM^`GNlS{;-~wYYhoNHx1Ve4@IkyI*8>_W0c1?skuZ6safLeFq`c zvF*P04o^GJ!vg~kE}y&0wRwUH(HeXVrjy>Gv zATvU^2v`k7o9f%}HQUrK%0;uPMyQc;>UfTlavCVr?pNhvJ&DhU43}VKnJ|EZK;d8M{k9)JAl{`1`*_wHEV&;{%MP|R-YXz?e@V)MW)N;YsV=OjV^_$v)vb?<=q3WR#D&6e!%7I@b-55I%3NT z-bN4<{NZHW?S^QC@Wt!=L}zTFK}9_HkX{PPQseV|iii=Z+bGM}PdMY;`-y(yJlX0{ zi+VTj?&f{%-5oA29XN|>-p8GXZ_T?r`+00&zrVbe^Ll-?ySet~T%L{cs5f%1cAuB4 zWnVgEYI^!blhfJb<$X?fx7+7*4itZF#%pE~)Wf^@3W$ynx6HYtOTP;Usc8tXI3ceVi2^H5e5@be`Y} zF`ti-Nr@IfhF=fS4~T!GiX&+mFYSn!Eibi3OzAJRMA9t7OGnkiE01=EsDg+kFGS_Z zwf6EO+lQIa%oC;%ReqOJ)73I_HqX!!ELud>@ zll~AIBhX|#gvJCkmWR-!0ZryZXv{!keF#lD&}2!oA4bLQyu zkH)6Y{f(@0TSX1%kNxfg9m!&j3s6!AB4kDgKiNm}Dw1%6Xgi@q4ynZ)O5@5wp(Icx z;3}RIpj260Nj3A9O1pa!A!LEpIJ*R&pfv64r7U@df&Rze*iSfAtqx{@s_9|(570Fr zn|<&A3BHRFW#pkUV3x4bwR}E3e3^%Q|u(d+4Rz$LL!&wW2tOb#HNj#I86V5CbGRq%8b0ow%~+1cl$A+OLL{lVX)UM{;RC)QeJJRetsCcj*Xtn^~z( 
zqzdOw+|lNQK`QmT{AyH?gS3k9)BgB)o`zW()C_8UnFo+MrbWIV_`&m1BZ{OMqNkz)*AzVhm=$zOsxyY>Fx#{kipi?Z_S%|I%t z=GNpZ({GJVp9wVBdr>~%Sw^FB6KE>~&<4A-r#iG8Y6uQLnuIqvWW`x9!YP)B{vAJg z9HjyadoL;*`^?DSPyg_}&rhCeu(vua98Qs_arFUW<;9H60L)swzRdu!*i9Vg(A+C9B4xt;>p;}f-jP!OX@`?^RWWQgH_Zto#ceZU1kZc|@} zOM=0o#^XJ3z{QC)F!XW~pa8WbDF(zZB7=hG^zQ4FF{7lLa9pA;rlp_}E=pH>6^b@SI*>J6pTvr{KnwmqAOf6}nKP>U$+{utra9;M;iHGIne8_$Wx>kFCUq0v z50-BXS(<~!=9`)MBkNAA3ui7AG8cw27Y#8HL+0qRaDJTt{|$9FECs>hRg()R^MZxV zAMS>H_A%2b(`j?qRx8+QhqgsEL~dc&RwLMI0)2vQ<A7DJDxKwq(Q>@yXv^rDKyJV=(fAwnRMSVA$-ZFj)@#OX5o5+zQ5vZ3 z^C+p8OPZE!v=ARzbj`H-qx_0yLj4DeQO z6NX?>{luz?#leEL*VETUbwplyw46xK83N^yPgo1TAPlBONNPM`932eo4Lm;4|C`LI zZ+(KQlBkm0k}^sltW;>qA8)gq7^Hl0Y~`m?p(sTh{A$^&k|cmB=>dn}QiTA@ z5q3=p1JNLEng?O6L9JiQlgDyFDC3Go=7KRx`Ds8Uv|o$Z_>Q);gMPyAE}k;ky@oCTCwIcT4gnXUo##5 zDPtyKBjYkyO((i==7a>E?j}JI8~g^pdcU;o)e$^-2rS&a1PT9+l`0-mUmQP|6T>7o z2B!;e8o!9EW8^>@+9Fyq0zG+2_8puR#ErYS-zsL`A?o=2%#qu_cwzeOqqpDu zf&GboAg{Mi|M-|4oGv|{-mZNvuEE~p?y*bmV~vY2sbMcOBE(X_{S@CuI(*;Ng;nZm;)6#)+?9$ttU;5T;fcXvo0VX3IK zw;Fn!7)mmn1E=2f#2dFSy?X1zUw?KoAkR#LT~u-V1~j$OPzJQM@<;}l+SpJa7aPlf zu2vb$fJrWljRt6x;be*?m~dy)Q%&1inl`j-5)IqB&>tmRuKGmJM9K>|QB=eEgh=Bd z0}pVn8;)Q&9GP+;;q)Pl+YV!r4NpyP500E$hb8MVl5nQ>P&?P{?mi$p$~E%PMBNkY z{wH1SEKEMt>2-HIj7bKsDRv6t>}lr!T;kR*s-b&m3B!p8^hBgEC+ehUsUH&Pq|i!( za}jpG6TS~Hm+WF*GV~RD7*g5i?f|v}d9$%+-0uf0jzB$s6;R6&V%l7A!%`S5Y7bfV z1&#Y|+VX}>k<6?iCQ4Q7a&Behj})IM9?!j&u{glra8w0$UD|to?}Z}p$QKCdmA5Ku zC)B^vy`!6~`+en_6RpErN7;zIJZ!HM>~&!14sR24>=9c<*j6RjsseqJ%*4P{-qmea zR|lWk6Rh5Q-PRV>5)0Nwjf6F4r18g%5o^K7&J#N$CFO`O!zFb>N!`SP$@);q+DK`| zxvH~O;nI4cw0>gc}AhLmOH3(mEiZ3&kz6UvuO zv`tw;<;@XC)un>-1!2c3!Le$x=4xrku_IEwRJ^46HYu{7*x6da9V$9loBeyTY5h6LfEe-1bVQMO9Sx8@9v~f8kzbn>3+3yz8 zEp@uzRckTdKm%=R6Qz>+^GuY4VUi4|6o90pgDNm()L_b>&m`{IfFZ>K7#3i!4$^)) z6^QDLyL=UTf>1mFU}6-3SrGSXBq&nzQhEg7vH^W9sa(R)o7d6J3w_ptk(0tF0u;{x z1p^d1Ka1rjLhsk5>|09k_n>hm^}8DStxd(1Riy%A3AmcX<1dg0%w2ElxntEV zm6BH+?hn93vlPZjT+c`|8i(qMZvhaWO6B7A8#?38>;$!Z$OXosp)>BpYfk%*BJ{y4J0|_W{PeLKL8zP6E`!436#^`$xiOjRE 
zez<4C@*>mW?ZWFPJY3H}*ym#5dWMAVAeRnTR(yC#0Kgs`?Ct9p&460k4{+^02j#mN zqQTd`4{i`h7d0F?Vj7Mwfx+Ff~jl z*Mn6O`~svE>lI*P9FAyGMkQZZQA3AB14J-RxrD;biTZfAco~eL9!|ny3L3*jI@c2z zM`QSL-va{vLqPrr#|kAl_N|cR$)NGcTWQ%R`O#HBc|Md@Ii!vxz?p`~D;z62Q#7CJAC$VXu-Uwe zq_Fa4ocHc?d(Z9-d_PpN`g(dJcpz=1U+9VS!uZRB6{{wjf4%M9ZBt#L#@#~2?%;R6 zD^z^ZujpDSyOLeS{)t^xf zQis*DUYctVAG0XVByjyC?nvR=wg1((Kgj$()+)c(VMsEzL61C8O=7)tQBOWRh07wJ zpBhF>^Zoa3zy0>@4=>KV^3z+B?@8)bq7zB(KHclvU%gV)xDRxLJ~^)v8CQ1)Xt(YI za4d@FlYA>2VD5XC8-l6q#cg`}_@&z~eJCH;e)hp@pN+rp`N-R!|8&%$5;gKY2hpVP zaKaramxGclL(T(@x&07iNHd z=m-JUgU)mcddAkP&5^vKF~=##>BYm^FV%!) z8-o7g^5BXmgv@Wj!Df=#V#eBF#@4T*lq$iecXH45jLlce=kVD=hBGz`8Jqu;$0L*x zbCZ?$MMe{&{uKkc%M9I=p}lO?B-*i0KM~fb&r`I*gBvDc2U}wmY^nE`jARUoujG+px7en&%Tj z2hIH$NVyS+M9qPH^-JsOMQWe>!14hNN?F|s33_lRp#D6`y$VUI+~OzL>?B0@^CfY# zFFfS-9kk2xXO~5jbvV>g8&U6ZcgNrbY_|v7xfs--CE?;;2R@?Z+ud#uV^^~aW69$q z?fIgf@9}_-3pnr&CLw_6@R7w{-nK6G?F2Ha1{d*XbDe6#nWi}AxY*=A0Xl9ZOSA~~sT0G7fXF?Ahin>SF~_*H%y z58FYns5=dinG?X4!J|6O8*%IpM?0ESwkoTF)A*R?0`9O!3~IqbX8c-sS^`fxzwU*6 zu$NQ3C2=nyrM+rT-3r{}vG>Ubhl#riaGRCzGtF_tpwNNMyJMbl>O1jf&YX}qJW@Eo zo4P5`HWd5-AZ3gAJc0GEkJ(7ZBpYc@%dYutB-|cn*+}Tl?dk9H%r=me(%o>cUWYrd z)7$Oohi%)yA9T5ViuEEt;2kDDVMI;;>gTsUJOidwgMEOOY$;%3cn&W@@e&(JwavPg zjtd%v<8C8Fj}s&;Y(McmxCAi>_x(~`leqwN7+xbsNSLr}3NDzxr9MP**-+>pxtB0e ziK?0<+>anT>jI^&@qnw_)z`zV9jKYd#f$efYZ^V?j&={fwkA&RfW^S$HI&=LpN}; zoX!odawl4Gve~C@@9vijIfo%h3p2_u9g<8jed2?FX^3pOWB8s|Fv5wD?JRt^qLLw% zwKAIf^IBOrA@I`{f`@MdYI)jRBw1OHhb%2YW6P~f`*`jIdo6P{Kq9oUZUOY~3m4Z5 z#r2{5rEm``oL?j4*W97ilKRvWP?A@DEw6fLJ9th<+XU-EFsqEYVWV9z+Jj}_VxDN3 z(gaJkTr+OH36|F36Ne|34Iln}&dQj<^+`c77+X2{!JGvVTUpq)NU$xsxHRCquOz@MJgsltVy#ec`-1E5;t3!@;*RAXCWDvGh z&>+7iTJQ}F6%EXtq?B(!CRr(`0#Xk$R1`S%a1$jJ1d5qCurXlxuNpC|R=#UH2)`ma zRuqRgCBRySIG||e!#BO?i^2Lo!^n{Il|;wwgH{Qc19JQt6twoD$jyw5&-}MnZ(n*J zeu)%+F&u-bVQGHN){tMT*c#j@w1=yfXkXBC>1*2<^-i{l+6kGcp`2Bg!8I|ysE%=CievMszQ02Wz%8-co@L4sNN!EZk=se zn7~~h%%}vzg48ueY_$`*!1sgMtN(h-eI7kyBgUpu;?2^gZ1rWDf$ZgMx~W)uIbVzU 
zQf<@XdCY{1(DDE8%mnoV&3bLpeTq1gKmfrNCfBff1T3jw1Zz;XX(a_`tR8>|9?T2<1I>$xl;*|z^O_gW#$?ZwWZ-Yy_0)W_2iQMuTW|ouWG|**#2i2p{o!!9yA-0_bTq^ zSSgw8zrY-tGm?b>ccsy~;6A{1x;S^)uur*5)dJE%Q84QtLQ{gdrOx<4#gesbd|J=%VX8B05rHqM7^i?3O$ChLZD zc*(gkl3N-nOKn}$KE%3U=2{o4(YmNX>tZ!p7aR3z$%@cjS*nGSE30)Iwe&|C5@S8R zF;hDWh*27fz8i}3S6$^*CS<*V})IN5m)1yTN4poCJyEVv-={MwIY2Fhjwxs*iD;L<1nexRt$ znZwOt>mtFrC~U12thFI)UC3M?Wa_`}gv9BPrsOXmGiyrXw<5pk)WBpSVwe-2Bc!Rq zBdM7;kIanyTo&-y7CL%qdiWReb2&zZ80BMxqmZJ3GVa1NS_z+#Pk}er%qXZ;@ z-ick4<&*r>#>u{SYl6$S3botrVnx(PI)KU-B_NrgCpspVO{u4LO)>AT4X)TGEZKe+ zE22-4X)-TJQuBg}s6AgMf+RH&R74w)2;K)t>i0=RMwt^NsX0MKw4hiff+RH&RKOIe z=%wSnz{a!vcL~Tw85O;i9Pfz2!&WlD-jh=K!{9e@6i7=$RC(dV41e1qIgjD@&3Fq) zS_~9^IaDjs@)5Q8y9$qcpS(T{aEnO(B{*yv=n0W8pLb!-g;57aEJPxO2Sf7N*%5qZ z6qA<2eTBP(WjfFkEz%EcEj zC_|G4pHP{fQk9YHydyiK457~-$_eZ81YO?f%8;)7h~_4#|59ZltD^)&ch!W-5XHwY zi@GRsEBOT>C4Y);A_*oFd{1N^xW8W9f&I$+3S3okVSXel@5NRu%>OEy7Aq`>Waq!Q z6AKGu@6<2#O)H4MV>YOXe=u%HCoZqhZXm$pOhSA`jUMMKMiS$6?S?${m0S|y{{vI7 Bof`lE literal 0 HcmV?d00001 diff --git a/document/__pycache__/web_search.cpython-312.pyc b/document/__pycache__/web_search.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..171a31e2d0f142a3e9b1823dd74337300d47ef92 GIT binary patch literal 15500 zcmb_@dvsIRmG`}RzbwhNEWd5p#(-?F!MuWbg@-Y(I3a|WC!i&KWg{a?&b<;~A}5q2 zRBoJN(lj=O3?^xcXqcdOGL4gvgtn7TX7!IxicGBB()BT`e!$Eh6*1pxr}^jG`&?Z~ zc0`lRnqxTkJob5?v-fZBbM*VHEDZ(jzuxg4{(U_~{RtzYlP3bxHy|)bF%(0)s8-=i zx6%-oxFlWDR%w^4Rn{eMm3Jvx6*T5cUCJ(1tBQnWE_Ii>RZYUQOVg^6P!6ioEH;w* zI&^Q*Laa5*p^{L|6eEA0Via#lgw(eZq^8zf6PFUuQjHoM#H@!D~+IGc@CLG5fdc)Yyb<*|FOP}A7n-xk-i9*?*F z3D$np;o9hNdmV1?M%H2Xdf0lX&({U<`eTkmHjWIluDgfDT}V6(a8~eMp#DQ`+t^C~?7*?(u16b@w}P{8~P zYlSNztd$xr@QFMJW3zi(tXeh)$DE6v3B5e@ z@JzA!=;00}v&l=HZZC_S5*7bE8!M`hqGQO%z9S#z=B9tc~n*&%QZ(~ zEwYK92p5jZ3Wb`mk{L6vh6Hret#F=GGmoc(DP+hAalov%ORzWizg@z}o|m_(7&)F9 z$WzS7Q#>zgRWoqfTQyK#5)VC{(+0U9&Vn1MbYW+orP1!M%`*a{$}ut#Do;`3p2OzbAqq7}obuP9BR#|kkOC~*>JIgyg4dMK-^ 
zh1Yi3kBevaF0w0LpLTFPKJVSK^hDtRbG(dUI>0~Vbaz-)>^vlqb~wB&@;bH_Jv@JW zE-*!}uiMtmdXD$lgw^A9Gh!y2)9v&on5QjKzg5D^IESm9)Xd=`SS&0aa%A*`-gd&2 zwq5GP)C!>CPJnlkikVAZb`47cvVLu>xcuU(bE__{JGU;lW~4G&ye1$URTad_DlZ;6 zcjThyoF`nj_J$-{wh_~dXQi7zHHRO5ELygcWX2U#X~kH{(n!hDXvuO+Dug6;)|jdy zqN=#5sv>-ZSFkW}*4>r>0nj6g!bK!oH4cG6*r4BjQ#vVeOZC!zI^)KbFjA3*qkCiz zppuKQq*vmXaAt3EV|6OAi!&X-l|=txyU6@df>B}0Afxh2&Qq^RQf2+p zU6fVrLpFcs?T@BDe0%(ZH>L(o-+kdF2=j8XA9$t1Z9n94Fq>Og#0WmzcLLqSyT6vWp>hqI|~g`}!`&?av(r z?<93Qzj*NJvri8`d-mDkVnYXKeGrS0%h|9#C zfFCBe5ot-`Qo;74I&nkHat~;Qppt%Hb14^H*E3t5LKJ;?CcjZhLafP^%|ezOR5SsXa8 z-P+adVZBW(>tWe7n4%J!OqD_D#>*W@W0eXl0xM;=fX5*tPqKt15IJF)!4fh^wGCMA zBk+JHR8*npD2prP>I3v2brrFNi^mqOjVxRn*fgfAjOZ$dmq&Cp$N;wYZ4VbL8PzR~ zYvk%y`VYF|Z$qo1jHWR|^=F1^tYb&tj&R}fo4OToDP^4d*ZcXDvG_}hhNhD&<{v)S zJsFo`(pUG5iR{)?_P)<``!Tzf{u@^bL+D$&A(#57VZ#dQH@RgSmP&uKghqcw*~S9t zZ}VyJS!Cb@5VYZ3i4IR7#-|mqV?hYgoryCi65v-vDV)}uEX&a1sW?UZC4lmDgi|F2 zSgJhXEg8g_SZlA;FNKdQdM^ArQZ6$D%wSMw&hL8mcA zErycq%GAhPGjTmB9@bvDg!0S%Fgj6?_bR?!&3@Sl=yzuZo|bG|=|yFVIA}rP#Ri2~ zQpo$2eyLw}RsL3j`|qNFbD=&;9B{ySy{feNK~-s%V-((UG0C6g$1}meDXZ9qu0UaMN*HOceCYXw+LOdPxh7%}S=^lDMbZuhy{STa%^Bu_gyG8e#~ zh!dc*LSTg_a?<&fkhUL6acJ%U&SWs1dAjfE{@zh*I zqmlZpQN4+h{Ola06LkO#5D3%ZtOJjm?T%6$-Jj#9 z+dxa*1v&v(^CV>d-#ytk`Qnvy>DFSE5Iu@)A_yZ&7x)nQIWo`8Mmt+ z$haEtV42NDV`l59*&5g!Gvp1K#tQ2rh4r5q>SOt(FK5N&RDKOCve6^}$s`kovIvRD zB)kg#AeDfQR2SVV1+yp(=2A(o3``)v&N`@GgYrD)mtnbZAB!7%1UjzC-|Y7(;?qaaK_DJp2nUGNPh zl(S0!dzUh~0KJtzrGH2-qk2_-RVUiLW;qlASChuzUbQ#L*!@Xp2;0m#rx)cSIimk%q^L-h^C!*bG}Zr<40o3l|GsHoq`(2}90dxl#c<4y!4ca}(eT$_B0}-Z`zpP> zD3(C@S;U@!@GAxQpV}c+VovJIWY1pj4ysME9}c;9jz|UdEbPtthI(?mXiy=}pC#_? 
znL<>knUVHpJ(INy<_N!O0@dV07$V5U6v!t!0=ofnT4tEg@HFEtoJP~?AmvR0vec}c z0w&dxX1l-hs~08*e+8Ecx+g&TftiR1Kv2jPoU_C2>jE1(3-ZnewPW_NLG>t+?{gAQ_t%T7|a%$37yLHLA?4TMLr4@0tGG$3$5 z(Esmzd>$}GY6==Hf>HbYJ9j?*$FFBvX~BY}g}QhOtP8B89jsg)ribVhQ;-ZUzHaI# z=g_^k4p0Wt5{m_k9b<8eum}gqT>@e3spf@ggH9j@OeIew00@gG*J?`BU&wUY;bLUh z*5>s)9)E>=WprwcVCtc`bI^b_5J4pitFeifAwtE&sF=tzUTAleGH_ zQFb4Yb2vc)ED?Rdtg;!>PZT_8OFV}~UxuA|l8Vdf)n-tc{-7)TQbuPjK`>GnXogQ# z?wLpW9vPZ@vF==5uzpn67!!0K6mJV6g$qV?3*)lFEX(aO%YYW_DPy^FKFghxRHp{E z533`lnp>u#!R=?ahbtexk#|EGwrsg++Bz<#HI6JBnHRQf7&UGDkG#T2-u$8MpXJRD zZi-b~FYmv!Kj^zrJG?(y-7?TJUR@V_=JV>c=hQ>=(DGaJm&PjRU#_`S6Wkj*6s=qt zv&_A$xugj$4>d$B%VTqDE^oWEEqFA)?Id8a=qb>EvOGI4spL&aph6K zH`bz|thkXXv|QAm(+~IFELe6gmzuloFG|X6ja#Vf{6XDW-O$cg47YQO0xf^3hU9SG z{IAloer)*aJ}8GJ2#(C6Zdkl9=m_o&7cLvsEsy2q{b1`YL%}S9t8Jl*x7vgIK#v^N ztpK8zwh{nk{u{Rk*5=fDMN=L1YxR0_Qvvmx@(tNdI_bYIRzc)nSLQ&@Cpvjkp7N7y zC8T^(klR!%{iIfk>2+1jD(UZXRp=L2HcO?yTSG(m-=uQzXK0Ri(2!M4F9JUWJV2Yv zh_f=m4iF>>=GH6o%eZ`R5`<&|8|ikd4q)SqycaCN3L>H`v-x^s|L&?Pt<9HM0c@K#o+8GnLTJF5#DEng`G_6`<`=ok@WP^O|se)m7bF z3CPx~_Nz02AyK@8uvKRQFk*hM1|Xo8$?|KCQtWeSFh;{~EyVNl`GlDV?}8sr8DplCnuKCclo!OF|lR64K}aSY^=Qz8e)NO ztgi?1IxJV$@0o|H6pj6ugiZh5bys76kn6TNycG0~6cP z>j004_*7$}h_~ht^I_^vLJ;6+f+Y73#5_sFJ8xv|PR?O#_pmll`Mo~4Z|medZq|jg z$Zk?%o$bOXk>o_C6LD^wEzZ$y$tx2J%BuhdbbIhd9q-H7r_p;DDZmoSb;C?MSWqwo zsn2#}=qdDA^oSrgLP!+h;#UymTOlX2C_e-f^()Bb`V*pjk=hIvO=3?NJaYEPpy#Y- z*iH2RFLH}+8O?+Gv-)s(uapDqpC9Fe^C zz-Fjs!J@I+)t}X_9$?1I^FA}ryD=xw{Dr}cHEGXk!)2>Sc8@fNOE%p!HjRsE%R|0U zN4R9|sBzux{PIEnS%1t}0QLn?g<}OJV+FO5f?9~iwN$~JX-ZL`4{V9+sjR#)U3ElP zJ*t~`yR`b^fpZ7W+X5{yb3vf_wxMw7V8l>+tD+{jcBCd!+jOHNQrq%*#iM7p4y+$I zdTY+2SlOJ5o##4(=1@+wY-y~d^5V90+lGC?L(!7OA~aYDK*6#yR$hIpvgY!FOAFqp zi_NbMJ{4MW<=7PifPrfBz#{+#@=69<&bAD%xoKJ)&!s9KqvK}ER2*pe>-cvO;>cf4 zt!OHz-rqC_{7=f&oAacf%&**Rl>SabL-=<_IrzL1pMAik7@wDRb8PE&u<+Cd-h*V8 z*r#E&2T?Vbrde*G!PyQhw=_rr*gw63trX?`Ea!)dNRjXN%0ZHW%@Pf7X_t~{6rAN4 z+Ajmktt4gj^~?XBDFJAuuch7R^lz7e@=iE+(zF1a`w6hn%EYEJ-M53S7~7S5vqdU@ 
z(%w4DhMY76_+=gNQVh{WN@8YuVBDwO!%hB0WD!nqT%x= z<0C&8Osnbk(6{?10|Skg6B5gOAEGV`PzkTKtZ8IIImzt{nx z7OPHBtoCDa%1pWoqr`4GlXnSkdGM-$-3gvmDX_BBnDG5%8DbzMJc{rmT=fJW5)7XC ziMyXJP(_^Ga6gpTx+kjAwTuT4{U_AMeF7fX%5qErWz4edGt1Ty)7Z+bk(FD+s?tB@ zl?0mN5`9)qtfKmI@ulM6?(ovB(TZ&Y`dCH%fO1HFR-a%yL+;@2U~{P9owkv>vBt+D zjgLi_?+h<{JpA~9aLK`&Mq6A>nM>o9lrcX5w#!1wRQe^Q$;!dvi07X{bRODvox8F4 zgQFvd!==rm#x3I&i(^IQv68Y_WmU{l6|1z~vdq7%yQF(V54@o=2Yc|U{;RkWDiVyG z=hiFMS4nE6i8=23V0x$ES;`@feg-%p9Z_B42^Jr zBr7ww!6-#!1h_O~FW`k`qHkcTR43{Ll>K+=)TT;2pw2Xt6;Nd0Sa6!NUQga$ff)&B zk+?L}m&w?f)K`Vm0`dKqZsI?@hk+_QdHICf)(>nA5)`3P)BcL(1gRSM0K;KYwlL(6Hk3we`24BmNZz%teM-krW*CTP&)DD+_J z{hv>K{4Y?>BD5ehZ9y^x@9Hn|EJ<(S3jNQs6G~C3seXm1TfEuYbc#*;Ms_FZg76c z6R8o)hKpB^>eeJG5+b*?x!+8<609=N7q#(>zEA%pB`eU}r(|mFeLYoFc7eIrb*?Me z41g$FxGbQL8A^u^4Lub$RD@L(f{_oo{S4bBGJV33ZioyCAVem{A__}jQ~#2%n@iCM zkm-sBE@*Q7l8j0xO)h{XuO*Kr9Ooq5$XpxPuVqER!bH#lJpd3*EcbFSBxG#IF9*<| z02?{(wOI~eLt5*Z%QH$Si*k9Eo2fEBpsWhYCP7ZBf^Stec|QTlE+C`+cL6p@*lb3V z2^;~COR!^5SV1ke>Ut~V=TmKERq}oN&uV3(@9yTH^!pm3V71+4s>zAT)J; z1Q;p2I7z>g7JBoilc%gIUhecdx;O#AAU6oW0NZ4D6g6MK}ov@~(n!ih04pYO$pew)}O3K2j!rNsF#>yHZ zWeuN~E$wRwG!3kPS7c|Neev0tv0}_PKVqC8u3dLSK630vd)WHOsIg@nG@BLI)?HaQ z(lD|e(VBBli)I>?18LwnwR@)u8rTEYuAj+!o2Etg)Kuk08t_v2vU?_K(Rw=IJ=4?I z6E3WY8fwETHoTbyU%0iz8&c8XSs;AE3}c^p$MHNqLdhVuQyfDKiMzE|l6FTT`5%;* zrXYYn2@C+dOH#ZfrKm8{@7{vgQ$Nibq1GheNVRqXwkadOAcvDD6TVgjk(qaX`0m7+ z#M?4qvkS0d#!WctJcvG~H-opGqG3w?)B!N&_0p-SfoV-&0-!`EcI{PqkOn4E3^(A^ zKS__#S7^NMPHF^KB)p_<5jk5kZjschQQ{r%X4lTVKZ4iC;=xFO*wigjvdCiG_=p|u zP=7s1g^Tw^9^M<;dtu|S^0MKQVXR~k#Q&Q-_dHczLC+X5j)aW35dz5(10y02av^QD z10iB$sKhAq@SYz>2}2wN$`-h3B6`m-MAFB{z77fhgON4h$k3Q7jLdQ%(y(>-*r$8{ zkL~xipJqekzc~~or?QLbSCqvM7#hAWNDd7&xCad_F{m7I>DY@<5Jpr9A9fhL|BVqn zG;>6?&dAQgH#&mNV++>bT(BM@iG?y3&_w05%Ef0k37o{|aK5%Sc)r{2bBQ2}wLtIq z+ytOXyri)CI0wTQ!tZ-nF!hPA;ZrsZs96eoidPU5ASdk60jx+04}Zni!$c9q>e-)x zH}lOfpY!$Yd7>?DgRh^{93yLGf`)=sR9^M^!c=zbTd!xnsq6PZ`S+`0p z5zB#4-B?3&q@j7VVQbWKAX<7LtSY*%px|{(S+sOfta53rwB}aTysOektuwsSHny`p 
zva@}3=iz9rGdkZHt?K;Zv4g?V>$xKbqbqkrtUK;$6-G^*Qm8e6{v32Wok|cGEA%Ys zkZ}^mino#Jj5YJEjClGP`p6~~yj+yEsruwL9an#TdRwXXi$pJ6Q_(LsJ zTUeYU;U`-tYS?AyEk+MnCOZc`yc%JN{pV2(;fxAaY!o3ZnvB^#^nQ%q1@x|hhqvz@ z7kdrkzeEptoS+M!4#&zRdC-GeWZNFGzDq# zf?Rq2@V+1uS~4OHJ#@t#sa`WpLE4u!dGd$u)m6xIhBn42_yiA!xGP;G7;fG>jY;v% zv|Mf)J|2=@>Yb(_7}qbBn}@676nsKZ3_c^9Zj?k;w@hO=-cC0N`S8Jf_$2coeD5%A zmgfh(p^aC1;uHkKJNJiM4@7nz6k?t3u!oIiD8!d1Z); zQ}Fpz_UW#FSCe3PPiy!e4o3FarZFSlsF!mzw3-{2 z37JA|;Z@ti%eO}s?}$^765hExyk|eu6y9?%vJ+?_EnXy%*AFq%6nx?;iG0bhH@NXq z&wYq55f%m4!*>xb(GuE*Wx^zaIAPdDC@kaZ{9h3pjEJ@BSL{F$nQV-fZW zHyfugPYqjl2l>^8u=`eHHljIULy`5LxhHtlYT*~4YuWdJ4tWH37(95sOw)g$^ziTh zQVXKgfY|9cXy~D+dd^A3ZCdjeiIT2>*MxM%w3L!)<0P5` aA IntegratedContent: + """整合多个文档 + + Args: + documents: 提取的文档列表 + + Returns: + IntegratedContent: 整合后的内容 + """ + if not documents: + return IntegratedContent( + documents=[], + document_count=0, + total_content_length=0, + document_types={}, + combined_content="", + content_summary="没有提供文档内容", + key_topics=[] + ) + + # 统计文档类型 + document_types = {} + for doc in documents: + ext = doc.file_type.lower() + document_types[ext] = document_types.get(ext, 0) + 1 + + # 合并内容 + combined_content = self._combine_content(documents) + total_length = len(combined_content) + + # 生成摘要 + content_summary = self._generate_summary(documents) + + # 提取关键主题 + key_topics = self._extract_key_topics(combined_content) + + return IntegratedContent( + documents=documents, + document_count=len(documents), + total_content_length=total_length, + document_types=document_types, + combined_content=combined_content, + content_summary=content_summary, + key_topics=key_topics + ) + + def _combine_content(self, documents: List[ExtractedDocument]) -> str: + """合并文档内容""" + combined = [] + + for i, doc in enumerate(documents, 1): + combined.append(f"=== 文档 {i}: {doc.filename} ===") + combined.append(f"文件类型: {doc.file_type}") + combined.append(f"文件大小: {doc.file_size} 字节") + combined.append(f"提取时间: {doc.extracted_at}") + 
combined.append("") + combined.append("内容:") + combined.append(doc.content) + combined.append("") + combined.append("=" * 50) + combined.append("") + + return "\n".join(combined) + + def _generate_summary(self, documents: List[ExtractedDocument]) -> str: + """生成内容摘要""" + if not documents: + return "没有文档内容" + + summary_parts = [] + summary_parts.append(f"共处理了 {len(documents)} 个文档:") + + for i, doc in enumerate(documents, 1): + content_preview = doc.content[:100] + "..." if len(doc.content) > 100 else doc.content + summary_parts.append(f"{i}. {doc.filename} ({doc.file_type}): {content_preview}") + + return "\n".join(summary_parts) + + def _extract_key_topics(self, content: str) -> List[str]: + """提取关键主题(简单的关键词提取)""" + if not content: + return [] + + # 简单的中文关键词提取 + # 这里可以根据需要使用更复杂的NLP方法 + words = re.findall(r'[\u4e00-\u9fff]+', content) + + # 统计词频 + word_count = {} + for word in words: + if len(word) >= 2: # 只考虑长度>=2的词 + word_count[word] = word_count.get(word, 0) + 1 + + # 返回出现频率最高的前10个词 + sorted_words = sorted(word_count.items(), key=lambda x: x[1], reverse=True) + return [word for word, count in sorted_words[:10] if count > 1] \ No newline at end of file diff --git a/document/content_transformer.py b/document/content_transformer.py new file mode 100644 index 0000000..17cffa7 --- /dev/null +++ b/document/content_transformer.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- + +""" +内容转换器模块 +使用LLM将解析的文档内容转换为标准化的景区和产品资料格式 +""" + +import logging +from typing import Dict, Any, Optional, List +from dataclasses import dataclass +from datetime import datetime +import uuid + +from .content_integrator import IntegratedContent +from core.ai.ai_agent import AIAgent +from core.config.manager import ConfigManager +from utils.file_io import OutputManager + +logger = logging.getLogger(__name__) + +@dataclass +class TransformedContent: + """转换后的内容""" + original_content: IntegratedContent + transformed_text: str + format_type: str + transformation_metadata: 
class ContentTransformer:
    """Render integrated document content into one of several Markdown layouts.

    ``supported_formats`` maps a format name to a callable that takes
    ``(content, custom_prompt)`` and returns the rendered text. The built-in
    callables fill a fixed template from these attributes of ``content``:
    ``document_count``, ``key_topics``, ``combined_content``,
    ``content_summary`` and, for the summary layout, ``document_types`` and
    ``total_content_length``.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the optional configuration and register the built-in formats.

        Args:
            config: Optional settings dictionary; an empty dict is used when
                omitted or falsy.
        """
        self.config = config or {}
        self.supported_formats = {
            'attraction_standard': self._transform_to_attraction_standard,
            'product_sales': self._transform_to_product_sales,
            'travel_guide': self._transform_to_travel_guide,
            'blog_post': self._transform_to_blog_post,
            'summary': self._transform_to_summary
        }

    def transform_content(self,
                          integrated_content: "IntegratedContent",
                          format_type: str = 'summary',
                          custom_prompt: Optional[str] = None) -> "TransformedContent":
        """Render ``integrated_content`` with the requested format.

        Args:
            integrated_content: The integrated content to render.
            format_type: Name of a registered format.
            custom_prompt: Optional prompt forwarded to the format callable.

        Returns:
            TransformedContent: The rendered text together with the source
            content, the format name, transformation metadata and a timestamp.

        Raises:
            ValueError: If ``format_type`` is not a registered format.
        """
        if format_type not in self.supported_formats:
            raise ValueError(f"不支持的格式类型: {format_type}")

        logger.info("开始转换内容,格式: %s", format_type)

        render = self.supported_formats[format_type]
        transformed_text = render(integrated_content, custom_prompt)

        # NOTE(review): the built-in templates never read custom_prompt, so
        # 'custom_prompt_used' only records that a prompt was supplied.
        transformation_metadata = {
            'format_type': format_type,
            'source_document_count': integrated_content.document_count,
            'source_content_length': integrated_content.total_content_length,
            'transformed_content_length': len(transformed_text),
            'key_topics_used': integrated_content.key_topics,
            'custom_prompt_used': custom_prompt is not None
        }

        return TransformedContent(
            original_content=integrated_content,
            transformed_text=transformed_text,
            format_type=format_type,
            transformation_metadata=transformation_metadata,
            transformed_at=datetime.now()
        )

    def _transform_to_attraction_standard(self, content: "IntegratedContent", custom_prompt: Optional[str] = None) -> str:
        """Render the attraction layout; lists at most the first five topics."""
        template = """
# 景点信息整理

## 基本信息
- 文档来源: {document_count}个文档
- 主要主题: {key_topics}

## 详细内容
{combined_content}

## 内容摘要
{content_summary}

---
*基于提供的文档整理,如需更多信息请参考原始文档*
"""

        return template.format(
            document_count=content.document_count,
            key_topics=", ".join(content.key_topics[:5]),
            combined_content=content.combined_content,
            content_summary=content.content_summary
        )

    def _transform_to_product_sales(self, content: "IntegratedContent", custom_prompt: Optional[str] = None) -> str:
        """Render the product-sales layout; bullets at most the first eight topics."""
        template = """
# 产品销售资料

## 产品特色
基于{document_count}个文档的信息整理:

{content_summary}

## 详细介绍
{combined_content}

## 关键卖点
{key_topics}

---
*内容整理自提供的文档资料*
"""

        key_points = "\n".join(f"• {topic}" for topic in content.key_topics[:8])

        return template.format(
            document_count=content.document_count,
            content_summary=content.content_summary,
            combined_content=content.combined_content,
            key_topics=key_points
        )

    def _transform_to_travel_guide(self, content: "IntegratedContent", custom_prompt: Optional[str] = None) -> str:
        """Render the travel-guide layout; lists at most the first five topics."""
        template = """
# 旅游指南

## 概述
{content_summary}

## 详细信息
{combined_content}

## 重要提示
- 信息来源: {document_count}个文档
- 关键主题: {key_topics}

---
*本指南基于提供的文档整理,出行前请核实最新信息*
"""

        return template.format(
            content_summary=content.content_summary,
            combined_content=content.combined_content,
            document_count=content.document_count,
            key_topics=", ".join(content.key_topics[:5])
        )

    def _transform_to_blog_post(self, content: "IntegratedContent", custom_prompt: Optional[str] = None) -> str:
        """Render the blog-post layout; lists at most the first ten topics."""
        template = """
# 博客文章

## 前言
本文基于{document_count}个文档资料整理而成。

## 主要内容

{combined_content}

## 总结
{content_summary}

## 相关主题
{key_topics}

---
*本文内容整理自多个文档资料*
"""

        topics_list = "\n".join(f"- {topic}" for topic in content.key_topics[:10])

        return template.format(
            document_count=content.document_count,
            combined_content=content.combined_content,
            content_summary=content.content_summary,
            key_topics=topics_list
        )

    def _transform_to_summary(self, content: "IntegratedContent", custom_prompt: Optional[str] = None) -> str:
        """Render the summary layout with statistics, every topic and the full text."""
        template = """
# 文档内容摘要

## 文档统计
- 文档数量: {document_count}
- 文档类型: {document_types}
- 内容长度: {content_length}字符

## 内容摘要
{content_summary}

## 关键主题
{key_topics}

## 完整内容
{combined_content}
"""

        doc_types = ", ".join(f"{k}({v}个)" for k, v in content.document_types.items())
        topics_list = "\n".join(f"• {topic}" for topic in content.key_topics)

        return template.format(
            document_count=content.document_count,
            document_types=doc_types,
            content_length=content.total_content_length,
            content_summary=content.content_summary,
            key_topics=topics_list,
            combined_content=content.combined_content
        )

    def get_supported_formats(self) -> List[str]:
        """Return the registered format names in registration order."""
        return list(self.supported_formats)

    def add_custom_format(self, format_name: str, transform_func):
        """Register (or replace) a format.

        Args:
            format_name: Name under which the format is selectable in
                ``transform_content``.
            transform_func: Callable invoked as
                ``transform_func(content, custom_prompt)``; it should return
                the rendered text.

        Raises:
            TypeError: If ``transform_func`` is not callable. Rejecting it
                here avoids a late, harder-to-trace failure inside
                ``transform_content``.
        """
        if not callable(transform_func):
            raise TypeError(
                f"transform_func must be callable, got {type(transform_func).__name__}"
            )
        self.supported_formats[format_name] = transform_func
        logger.info("添加自定义格式: %s", format_name)
@dataclass
class ExtractedDocument:
    """Plain-text content and metadata extracted from a single file."""
    filename: str
    file_type: str
    content: str  # plain-text content
    metadata: Dict[str, Any]  # document metadata
    extracted_at: datetime
    file_size: int
    page_count: Optional[int] = None

    def __post_init__(self):
        # Guarantee that content is always a string.
        if not isinstance(self.content, str):
            self.content = str(self.content)


class TextExtractor:
    """Extract plain text from files, dispatching on the file extension.

    ``supported_formats`` maps a lower-case suffix to a method that takes a
    ``Path`` and returns ``(content, metadata)``.
    """

    # (metadata key, PDF info-dictionary key) pairs shared by both PDF backends.
    _PDF_INFO_FIELDS = (
        ('title', 'Title'),
        ('author', 'Author'),
        ('subject', 'Subject'),
        ('creator', 'Creator'),
        ('producer', 'Producer'),
        ('creation_date', 'CreationDate'),
        ('modification_date', 'ModDate'),
    )

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Store the optional configuration and register the extractors.

        Args:
            config: Optional settings dictionary; an empty dict is used when
                omitted or falsy.
        """
        self.config = config or {}
        self.supported_formats = {
            '.pdf': self._extract_pdf,
            '.docx': self._extract_docx,
            '.doc': self._extract_doc,
            '.txt': self._extract_txt,
            '.md': self._extract_txt,
            '.xlsx': self._extract_xlsx,
            '.xls': self._extract_xls,
            '.csv': self._extract_csv
        }

    def extract(self, file_path: str) -> ExtractedDocument:
        """Extract the text of a single file.

        Args:
            file_path: Path of the file to read.

        Returns:
            ExtractedDocument: Content, metadata, size and timestamp.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file suffix has no registered extractor.
            Exception: Any error raised by the extractor is logged and
                re-raised unchanged.
        """
        path_obj = Path(file_path)

        if not path_obj.exists():
            raise FileNotFoundError(f"文件不存在: {file_path}")

        file_ext = path_obj.suffix.lower()
        if file_ext not in self.supported_formats:
            raise ValueError(f"不支持的文件格式: {file_ext}")

        try:
            file_size = path_obj.stat().st_size
            content, metadata = self.supported_formats[file_ext](path_obj)

            return ExtractedDocument(
                filename=path_obj.name,
                file_type=file_ext,
                content=content,
                metadata=metadata,
                extracted_at=datetime.now(),
                file_size=file_size,
                page_count=metadata.get('page_count')
            )

        except Exception as e:
            logger.error("提取文件 %s 时出错: %s", file_path, e)
            raise

    def extract_batch(self, file_paths: List[str]) -> List[ExtractedDocument]:
        """Extract several files, never raising for an individual failure.

        A file that cannot be extracted yields a placeholder document whose
        content and ``metadata['error']`` carry the error message and whose
        ``file_size`` is 0, so the result has one entry per input path.
        """
        results = []

        for file_path in file_paths:
            try:
                results.append(self.extract(file_path))
                logger.info("成功提取文件: %s", file_path)
            except Exception as e:
                # Best effort by design: record the failure and keep going.
                logger.error("提取文件 %s 失败: %s", file_path, e)
                failed_path = Path(file_path)
                results.append(ExtractedDocument(
                    filename=failed_path.name,
                    file_type=failed_path.suffix.lower(),
                    content=f"提取失败: {str(e)}",
                    metadata={"error": str(e)},
                    extracted_at=datetime.now(),
                    file_size=0
                ))

        return results

    def _extract_pdf(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Extract PDF text with pdfplumber, falling back to PyPDF2.

        Raises:
            ImportError: If the PDF libraries are not installed.
        """
        if not PDF_AVAILABLE:
            raise ImportError("需要安装 PyPDF2 和 pdfplumber: pip install PyPDF2 pdfplumber")

        try:
            return self._extract_pdf_with_pdfplumber(file_path)
        except Exception as e:
            # Deliberately broad: any pdfplumber failure triggers the fallback.
            # Each backend builds its own result, so pages already read by
            # pdfplumber are not duplicated in the PyPDF2 output.
            logger.warning("pdfplumber提取失败,尝试使用PyPDF2: %s", e)
            return self._extract_pdf_with_pypdf2(file_path)

    def _extract_pdf_with_pdfplumber(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Read every page and the document info using pdfplumber."""
        content_parts = []
        metadata = {}

        with pdfplumber.open(file_path) as pdf:
            metadata['page_count'] = len(pdf.pages)

            for page_num, page in enumerate(pdf.pages, 1):
                page_text = page.extract_text()
                if page_text:
                    content_parts.append(f"=== 第 {page_num} 页 ===\n{page_text}\n")

            if pdf.metadata:
                metadata.update(self._pdf_info(pdf.metadata, ''))

        return '\n'.join(content_parts), metadata

    def _extract_pdf_with_pypdf2(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Read every page and the document info using PyPDF2."""
        content_parts = []
        metadata = {}

        with open(file_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            metadata['page_count'] = len(pdf_reader.pages)

            for page_num, page in enumerate(pdf_reader.pages, 1):
                page_text = page.extract_text()
                if page_text:
                    content_parts.append(f"=== 第 {page_num} 页 ===\n{page_text}\n")

            if pdf_reader.metadata:
                # PyPDF2 exposes the info keys with a leading slash.
                metadata.update(self._pdf_info(pdf_reader.metadata, '/'))

        return '\n'.join(content_parts), metadata

    @classmethod
    def _pdf_info(cls, info, key_prefix: str) -> Dict[str, Any]:
        """Map a PDF info dictionary onto the metadata keys used by this module."""
        return {
            name: info.get(key_prefix + pdf_key, '')
            for name, pdf_key in cls._PDF_INFO_FIELDS
        }

    def _extract_docx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Extract paragraphs, then tables, and the core properties of a DOCX file.

        Raises:
            ImportError: If python-docx is not installed.
        """
        if not DOCX_AVAILABLE:
            raise ImportError("需要安装 python-docx: pip install python-docx")

        doc = Document(str(file_path))

        # Non-blank paragraphs first; tables are appended after all paragraphs.
        content_parts = [p.text for p in doc.paragraphs if p.text.strip()]

        for table in doc.tables:
            table_rows = [
                '\t'.join(cell.text.strip() for cell in row.cells)
                for row in table.rows
            ]
            if table_rows:
                content_parts.append('\n=== 表格 ===\n' + '\n'.join(table_rows) + '\n')

        core_props = doc.core_properties
        metadata = {
            'title': core_props.title or '',
            'author': core_props.author or '',
            'subject': core_props.subject or '',
            'keywords': core_props.keywords or '',
            'comments': core_props.comments or '',
            'created': str(core_props.created) if core_props.created else '',
            'modified': str(core_props.modified) if core_props.modified else '',
            'last_modified_by': core_props.last_modified_by or '',
            'paragraph_count': len(doc.paragraphs),
            'table_count': len(doc.tables)
        }

        return '\n'.join(content_parts), metadata

    def _extract_doc(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Read a DOC file as text: strict UTF-8 first, then lenient GBK.

        The legacy DOC format is not parsed; the file is only decoded as
        text, so the result may contain noise for real binary documents.
        """
        logger.warning("DOC格式支持有限,建议转换为DOCX格式")

        try:
            # Strict decoding so that a non-UTF-8 file actually reaches the
            # GBK fallback below.
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
        except UnicodeDecodeError:
            with open(file_path, 'r', encoding='gbk', errors='ignore') as file:
                content = file.read()

        metadata = {'format': 'doc', 'encoding_note': '可能存在编码问题'}
        return content, metadata

    @staticmethod
    def _read_text_with_fallback(file_path: Path, encodings) -> tuple[str, str]:
        """Return ``(text, encoding_label)`` using the first encoding that decodes.

        When every candidate fails, the file is read as UTF-8 with undecodable
        bytes dropped. Success is tracked by the decode itself rather than by
        the text being non-empty, so an empty file reports the first encoding.
        """
        for encoding in encodings:
            try:
                with open(file_path, 'r', encoding=encoding) as file:
                    return file.read(), encoding
            except UnicodeError:
                # UnicodeError is the base class of UnicodeDecodeError; some
                # codecs (e.g. utf-16 without a BOM) raise it directly.
                continue

        with open(file_path, 'r', encoding='utf-8', errors='ignore') as file:
            return file.read(), 'utf-8 (with errors ignored)'

    def _extract_txt(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Read a TXT/MD file, trying several encodings in order."""
        content, used_encoding = self._read_text_with_fallback(
            file_path, ['utf-8', 'gbk', 'gb2312', 'big5', 'utf-16']
        )

        metadata = {
            'encoding': used_encoding,
            'line_count': len(content.splitlines()),
            'char_count': len(content)
        }

        return content, metadata

    def _extract_xlsx(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Dump every sheet of an XLSX workbook as tab-separated rows.

        Raises:
            ImportError: If openpyxl is not installed.
        """
        if not EXCEL_AVAILABLE:
            raise ImportError("需要安装 openpyxl: pip install openpyxl")

        workbook = load_workbook(file_path, read_only=True)
        try:
            content_parts = []
            metadata = {
                'sheet_count': len(workbook.sheetnames),
                'sheet_names': workbook.sheetnames
            }

            for sheet_name in workbook.sheetnames:
                sheet = workbook[sheet_name]
                content_parts.append(f"\n=== 工作表: {sheet_name} ===\n")

                for row in sheet.iter_rows(values_only=True):
                    row_content = ["" if cell is None else str(cell) for cell in row]
                    # Skip rows whose cells are all empty or whitespace.
                    if any(cell.strip() for cell in row_content):
                        content_parts.append('\t'.join(row_content))
        finally:
            # Read-only workbooks keep the file open until closed explicitly.
            workbook.close()

        return '\n'.join(content_parts), metadata

    def _extract_xls(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Decode the raw bytes of an XLS file as UTF-8, dropping invalid bytes.

        The legacy XLS format is not parsed. If the file cannot be read, the
        content is a message naming the file instead of raising.
        """
        logger.warning("XLS格式支持有限,建议转换为XLSX格式")

        try:
            with open(file_path, 'rb') as file:
                content = file.read().decode('utf-8', errors='ignore')
        except OSError:
            # Decoding with errors='ignore' cannot fail; only I/O can.
            content = f"无法读取XLS文件: {file_path}"

        metadata = {'format': 'xls', 'note': '可能存在格式问题'}
        return content, metadata

    def _extract_csv(self, file_path: Path) -> tuple[str, Dict[str, Any]]:
        """Read a CSV file as text and report row and estimated column counts."""
        content, used_encoding = self._read_text_with_fallback(
            file_path, ['utf-8', 'gbk', 'gb2312']
        )

        lines = content.splitlines()
        # Estimate only: a plain comma split of the first line, which
        # over-counts when quoted fields contain commas.
        col_count = len(lines[0].split(',')) if lines else 0

        metadata = {
            'encoding': used_encoding,
            'row_count': len(lines),
            'estimated_col_count': col_count
        }

        return content, metadata

    def get_supported_formats(self) -> List[str]:
        """Return the supported file suffixes in registration order."""
        return list(self.supported_formats)

    def is_supported(self, file_path: str) -> bool:
        """Return True when the file's lower-cased suffix has an extractor."""
        return Path(file_path).suffix.lower() in self.supported_formats