From 074a53ad8361a7abc18f04bfeb1397265cbb0f5e Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 24 Oct 2023 22:25:27 +0530 Subject: [PATCH 1/8] Support regex in ABOUT resource paths Signed-off-by: Ayan Sinha Mahapatra --- scanpipe/models.py | 9 ++++++++ scanpipe/pipes/d2d.py | 19 ++++++++--------- .../tests/data/d2d/about_files/expected.json | 20 +++++++++--------- .../d2d/about_files/from-with-about-file.zip | Bin 6468 -> 2560 bytes scanpipe/tests/test_models.py | 2 ++ 5 files changed, 30 insertions(+), 20 deletions(-) diff --git a/scanpipe/models.py b/scanpipe/models.py index c8706f6ff2..d8aa9ba505 100644 --- a/scanpipe/models.py +++ b/scanpipe/models.py @@ -1779,6 +1779,15 @@ def path_pattern(self, pattern): """Resources with a path that match the provided ``pattern``.""" return self.filter(path__regex=posix_regex_to_django_regex_lookup(pattern)) + def path_patterns(self, patterns): + """Resources with a path that match the provided ``pattern``.""" + lookups = Q() + for resource_pattern in patterns: + lookups |= Q( + **{"path__regex": posix_regex_to_django_regex_lookup(resource_pattern)} + ) + return self.filter(~lookups) + def has_directory_content_fingerprint(self): """ Resources that have the key `directory_content` set in the `extra_data` diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index e7235f82d9..5a03a8712b 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -790,8 +790,8 @@ def _map_about_file_resource(project, about_file_resource, to_resources): ) return - filename = package_data.get("filename") - if not filename: + files_pattern = package_data.get("filename") + if not files_pattern: # Cannot map anything without the about_resource value. 
project.add_error( description="ABOUT file does not have about_resource", @@ -800,12 +800,12 @@ def _map_about_file_resource(project, about_file_resource, to_resources): ) return - ignored_resources = [] + ignored_resource_patterns = [] + codebase_resources = to_resources.path_pattern(pattern=files_pattern) if extra_data := package_data.get("extra_data"): - ignored_resources = extra_data.get("ignored_resources") + ignored_resource_patterns = extra_data.get("ignored_resources") # Fetch all resources that are covered by the .ABOUT file. - codebase_resources = to_resources.filter(path__contains=f"/{filename.lstrip('/')}") if not codebase_resources: # If there's nothing to map on the ``to/`` do not create the package. project.add_warning( @@ -819,11 +819,10 @@ def _map_about_file_resource(project, about_file_resource, to_resources): return # Ignore resources for paths in `ignored_resources` attribute - if ignored_resources: - lookups = Q() - for resource_path in ignored_resources: - lookups |= Q(**{"path__contains": resource_path}) - codebase_resources = codebase_resources.filter(~lookups) + if ignored_resource_patterns: + codebase_resources = codebase_resources.path_patterns( + patterns=ignored_resource_patterns + ) # Create the Package using .ABOUT data and assigned related codebase_resources pipes.update_or_create_package(project, package_data, codebase_resources) diff --git a/scanpipe/tests/data/d2d/about_files/expected.json b/scanpipe/tests/data/d2d/about_files/expected.json index 70eefb8b73..af0e01ef8d 100644 --- a/scanpipe/tests/data/d2d/about_files/expected.json +++ b/scanpipe/tests/data/d2d/about_files/expected.json @@ -61,7 +61,7 @@ "source_packages": [], "extra_data": { "ignored_resources": [ - "flume-ng-node-1.9.0.jar-extract/org/apache/flume/node/ConfigurationProvider.class" + "*flume-ng-node-*.jar-extract/org/apache/flume/node/ConfigurationProvider.class" ] }, "package_uid": "", @@ -112,9 +112,9 @@ "status": "not-deployed", "tag": "from", "extension": 
".ABOUT", - "md5": "04c25308d59068db649ebfd5d8103338", - "sha1": "9625fc925a01cfa22b6e1ce083a1b1802c2ce78c", - "sha256": "46142b274cda38e9c183001b58a72c8fc442b2162ae85e9a453a2c7a1a86d427", + "md5": "4f8ac19bc3661bbaac91fb8652b6c4cb", + "sha1": "ac27839cc3010c96b34796be83bc2e6b6bc1882b", + "sha256": "055133491484d680991ed98ba1f791e88f4552955e15d9c5b3b14c5d46b4ee16", "sha512": "", "programming_language": "", "is_binary": false, @@ -143,9 +143,9 @@ "status": "about-mapped", "tag": "from", "extension": ".ABOUT", - "md5": "b1d5c62c364d4470557bfba7d0338758", - "sha1": "828f79d9fc0619a5b869c46a54b10ee3573a00bb", - "sha256": "de514210e135dddffb6ace69aa5fe27e1873146e05eeb5b05c6de1f8c00b0010", + "md5": "c7fab493a90ebf247954e1e30582ba8f", + "sha1": "3090f7d036bf68c5421364ac03a094715348f9de", + "sha256": "71f10662c0806de172a04a07fffbb3d22bd72ddacef4188849a480afd4e46849", "sha512": "", "programming_language": "", "is_binary": false, @@ -174,9 +174,9 @@ "status": "about-mapped", "tag": "from", "extension": ".LICENSE", - "md5": "2b42edef8fa55315f34f2370b4715ca9", - "sha1": "58853eb8199b5afe72a73a25fd8cf8c94285174b", - "sha256": "43070e2d4e532684de521b885f385d0841030efa2b1a20bafb76133a5e1379c1", + "md5": "94c82ae800466538d15278d6be4feedc", + "sha1": "3837fdbc9d942bcd1c5f2d419148e944f7ce996a", + "sha256": "a4da19948e6906fa8af95a258b9a354f641adc6215956f0ec63f429a10f0f603", "sha512": "", "programming_language": "", "is_binary": false, diff --git a/scanpipe/tests/data/d2d/about_files/from-with-about-file.zip b/scanpipe/tests/data/d2d/about_files/from-with-about-file.zip index ccce7aedaedd2eb5ce14544be5db529a2412c62b..7dbed4ca63fc9d42a3d0ceddee26a1ec482e7b4a 100644 GIT binary patch delta 1027 zcmX?N)F2`o;LXe;!oUH9vlm5#GeAK(BLjmXgTiEearOH1H_cOLeKJp-0>q^i+zgB? 
zFPRw_z(i}nX}`k;0(*XIe`DWMd4c%{i<7NF=+l6A6Fs-(NgdC=>Djpd_@-!16_uKZ zN_peWn}5x{zLZ;}lGj$#;#R5J%bpUO2{|9+vTa@~%$XN_ebdc3M_Zj_Zfa(1IQq)~_`>YAROanc}%=#`V8Tct!2(C8ut* zU)Ud>Y5nbqQt6kMliN2<_4^=vQY`yng(vgf!Y%a?)d!-)ZVP^2pRa%ZwsdR=lfxmG z`29B&_7=4-k7)n2KQ}h$ru?=?(gEJ=93NxXR$m7OhcqZG5TW#J10s~}PwryW zt=|a@7Z}C}A7B5&3KIK1YoF-zZP;41QT^!39RXkzuTUhjgnRZyFX;ocs+*>FTVSn@RYu)}eJWEn?SIIlckrYY zM~T-7$7|{W_1r#ZBsMzV&N8)A`dv3}XkwXu>6=is4e2()=_?fd~4%bGHHPw%nH!e*)=b5y{Ps^jqdHJc< z-?M-1h?0(N{u*#FwS#v_$jkIK52BL{T^z1v2_L?1Ht93_zSL}^%s)Rq{o8Xp{^>O# z>mR!U7Z)y-n>Wq7%A=sF_Rsz207Xm+cozeEySruYnWMFV`rRL%#OAdI&@87d0Mqe&LNX8 zic2ek5((V5Kpz9M63|2lSkmam$S`>VqdE^P(ST^6HcRHcObnBlHQ^xxHVPCl$j(wm sGOD4mgAHVqEGVFOkOK;&b@C|*Ic7I@hyo5teuOXNCFR-nv4flg0Oh2UegFUf delta 4958 zcmai&cQhPKyTG@wyLyeqTD?RFL9n9t-b>UVM2otj220dnwbfg+=ru}05M4xw5<+y* ziJBelIFrHnd|A`}EPO@*J9%B3}VZJizbPq5sK5|oh6i|dsP4^;F(*x+ol-sIv(jyqME{p6-Y2mal<6I3<) z_qICKtZ6i6tx@6Lp0CJ2dDQ5X?^GBih5kqyL{o=JBTl|J*!RsAIgG~O_LqC^?)*t4WwN+BFM0K@~o!fsCQCr*S)dh zLk%za;gKSCAkEufE91!vufSmZk|T>1xuC@{k)9?L_Trpbng9p@EdLe%;TaqU{9in8 zV^mMx+tY5Z*}wUJV7gYpC7i{c-kGiD4)mzcd*#YI38>kf8&>$@RFn+1UCy~?3Hcf2 z?J7l(8%&aN>dPD^F~a2Abt`009A6?O^tCX0KaMm-@55COLvHbXH`=XcJ)BD-D@OR1 zK<-S@E;hd-7P8Pd*#BfX;IT2RuR;iGtG|^09wfu5FcJHuy^RgoDH&J!*ujHKdH_Yp z8xc*!CsRqt^zMA!@b;^|wu~!-dV@9VmqU=3g8)o^cQ0Msv`GZyz?;0vdZo8m z>_nF+qlJLl9(xB2uS7*d`~!pM_xWnkk7L0>c)eohG$!U_t_67R1D5NvS)PpE;e;U2 zACgVG0_ng%BynvZb=a$C^?$Q!BmqZ*$$_MB*T9cwCVh93Ol{vZVOK2wO!%m3@Kwdvyn`dt{tkZzo^7 z2-m1T{FE=9aS*i0A93EdRFtu<^W3k^D9_MZ>SRqe)#vMz)vAjs1-!OzF{TN6G*xwf zNvhRJob+-xL-6ePrR9W}x7}+&#NvkzJP5vMy>!`57dMsW@7>sDz9{Hn^7!avVr(=MsT$D=4c?-~Ih|o#NmKL4CMxX<1?J7YJCN$CK2Je0<@=J+C8%bxPao z1MnbI-SU#GxHKk>+ciE9U}DEB6ilZq46$4xR{r|ut}hpCeme9WL_R6%xoylP25m6m zBDtVmvk-CjTa~nRIWJg|Fifb~iL&d6m)5Vod)dA7-JQ~5iI-1~%d|W?Jn37#h{<)% z=?QwYhTa!VS-(B4VLIcs&E^rU1#C{kzTqC{(TGPfCpFzyNay5o!T=pY9Xsg36#j(G z=yp;1C-tSg5ly#(oYb~Ky5UO+R)oVElP#quDfsY*bEGp$C(lm=ycX^x4Pp+)3y{c^Y)$+%_}UR2$^+>4)^2zgFk&W73 zwWAfna@8>tNh5+E?#qO1)9PZ=U 
z8olG3kBmL*_dhsr8p=_}j*=FQ*9M&5BKK`gWfN_0;itsQ?wLfM zg8FDdb-Nz6_@-1)8_8e{QCI27pvxmdxnouz4sq@G*Q6Q5N`|Ce2EX)DsNkTsFoxw7 zn1?H`zbVda{f>vY=5an$KzI;DX;G5k4UP*PyEMPKu(QbLaT3CoZtB8Dy=r_-_{ zjIsh#&YGtiamH4ws1)bljGWdl)p>vk*V})LH%nyMl+&7Zedi?i!m}`4MOtsBkB7E5 z&5>O`#ddD&*7+sxdK^D5L@5|-Ar5R=cmu@j9WGP(LzUsqIy)Ce1z6aiu9NbiaN1kP zDrFrb$~5Pv@n=|&-Ly??+0gK&@l&BV$Ls(z%L5MnUiK+DkU7}KvH&jjsu1!_*UR~tleF1Sq)Q;x8l2fQS3HXDjHdXJnhFvnyl zqCU^-+Y<3ssQluYxir1VtWl~>Lv*~iv8N`?Y94p*@){o3Pb))O^l0jG$WpGpoG4)q zrz&+MJA$ctbqJ`6+*4aL`I-m|7HcS$YfYV;OQ>g2pbA!sl14m-nKa&*v51wrzu7oR zE3^9LXod|es~&&`v&*!o=ENdzHe&QCb>$dtuu0s^JHVxkRV=B-ynwvEO*WG=EK0q| ze@P$HnH|THY_ihgv3dk~4ihea*KGR~~N6s1b6vY3jE&g)(> z#n*&8^-)KrSJr@%ZrX(g2a^Dd|Va)Guy74NulvgpuWmVm??Y zir7#WK*$9Ks2`7<5u$=hVf0U{lWe$iNwvFVh~Iwb^r&CBi$vvB$Iq-A^C2p_7xAvX zsCssyp&Eo&WB8trj=A=%nT6bovGJQo7wi0U_&)Vln@L`4D&SRUvXY7JZ~{C6>=nIR zb-Vl%r$=S((!<1BShKr8y67+2Qw8!QV!M0y9*tR;$h&i0X|Zl1FlTW__3W=(23q0v zccfIDRn~Oz3{ry19B1yI!OYG276KrfhJv6~0@dcj`d8zfG}e(pDtSEgGke40m>HQPSS384ar2kZZ$n|XI7kK znr_ralMf#S(QJY~2Ii>;brXD7(+S-RMDFZGwy)t5D4i`n!>7K5fafb$VqoSHKd}pJS9wfE2EnL( z&dxGn#?yKGr^n0O`p$dB^_b1Vgnr6rr}BREJoD0Ru1_N{Z|Em((pa3`W>9R%-5!0^ z(r66kG^O(Qp6)U9127_41m;lgnV_OBQq8G`!*MHCeiPyV!{lTl;&HZ@IkJdE*3m6* z(u(tpOhmC`lKcnbDA;Wy1}W}roX9A7h3ApFj^@wLPE%aZOn$ucz>iHePK{h1kNhM} zIq+hAYWMX5Bb|hH{Jl1tCf}8%CDFit!AEn$T$e~D^~TRk-mTAOjqEqob59%|N|jRy zAIfjR`Zs4pX}lME9KUOHLxdT*kS&H6sf&W@v&4Ncv0&uqRQW;sUHd|HqgF?sa~bG~ zdeVA;G*XVpiYK=_rB(aJp;2c;e?{7(<-pn0;WLOMrrL<*5iu2P3ovWOI%Xnxm=wL0 zL%!)fc88-uk2-toH`32G#=6X56tBG&X0eT03uk5*t$VLmx^q6y-gi&S;iSR!ePs{E zV$bh5=3@!LR7a!?4-IikCs=jOu_%Oi5MlhX&BYb$mZzizpxeKGCYdONC1r}Dw~1;T zQkuMl*~-d<&K#kxX71=NSmcW8@7#)MnW)bv>Kb3>^Cl!d+I(+3kesVBeb2Vlkv@p= zdtR&XL!nu#l7LrO1&61eTM zW-6F`Ks3=onj;DCh?XCSHpq#?4_5?qg`&>UXlK4D3oN>7=jbcVs%1 z!3b85Jhzh8n!uLPUf>8J^mZA@Ybw;DWU;k<^N>PDv7Ey&vGOBpeyBKOUu8{GP~DSx z%+vF?p|W6F2xyg(PK4QtL18qE*Uy(DBI*q8qGjD98s+NcPRBr|(~%n7mDPC-yLUmq z5v@1|!77%Nh)q>4seboJtS^u}BuamR=A{jpT>W|11#ALY)b9ay@I9Oi!+phFxV|r4 
z7PlrFO$D@kw%%CPkaWpAoM@qA3B^3aSdj}66;KyP)dMzy`~X4vX55_N$9PAM2PGbq9-h0&{^7(@u-UYX!C+z5(yr9-Qog zNd&+7dO^vbUb}noxGya&ELPXGem!=x4HqXVz)zUXID*80rvT_zbQZnenI z-2I7S#-?UU@w(IE%_{3 z^F|N7eEhjsJD8ROBmH(~zl(6vQd)Fs$bouAtlE=K6u=7KK9cO{^sZiQF!@$< zf6 z-2Nu6+GV2dD|!;RBrY+86%H%jSQh*Zpp@ai2a6N+pB^~cx0Aoii+ z3jVF#ahs&)?gDgN(*#Wlau4PMl8JrNX<2;L7Rt5*gUrtNd@O7V*GK8|pG}l_qi1zQ z?k~GFvr7TXImjN}za^ve$xbgZ1R5(j<$La;&KT2RmQ(QdPR@WXrZDDpyV8vO#_;{y zp3EQNOV-<=1r6QjLwgN0t0uyX@q1ZBI-AAf=55?>$)4xVS;vi0e!*bnvTag-aFycR z^{I=>)b;5gA>8rOxLSr6LMswWLt;Kn+Am$4?)wwo`L^?M%kpb_(#@n@r~cH@;^ys^ zkGcs!Tp{0}Lt1osKW2O0-}}&>1GyDSrzVEA4|$<%r-CRJVCwo(#aCUm`5?S_3I6)^ z;rrtkb>oqDZv&L1&%eNr@1Grt8$4n;NLfC(5`oiRiN&Nc%^UZ*%%By`WtugZ@+++R}Ug6p#cF4L8C!y}&_e%N!-# zSwAE-py!nDHd6GXENk420v|=XtA%ru!$;*L7%e#M?A5wbtt>E8wg z_XR3R`cLVEV}>DE{-~6HtAW4t02_eyUnT%h66TFFh4GU9Q?C3;!j;3A!QVgt+*c6q zzfHoQ1-3sT;XfAw066|}T4-1;*oX`O{ikR6`|KzpW^gRs-y3nE SP>TONXpV>l0)qd!+ Date: Wed, 8 Nov 2023 05:22:09 +0530 Subject: [PATCH 2/8] Refactor ABOUT file mapping in d2d for efficiency Reference: #1004 Signed-off-by: Ayan Sinha Mahapatra --- scanpipe/models.py | 7 +- scanpipe/pipes/d2d.py | 178 ++++++++++++++++++++++++++---------------- 2 files changed, 116 insertions(+), 69 deletions(-) diff --git a/scanpipe/models.py b/scanpipe/models.py index d8aa9ba505..92d6a530ba 100644 --- a/scanpipe/models.py +++ b/scanpipe/models.py @@ -1779,14 +1779,17 @@ def path_pattern(self, pattern): """Resources with a path that match the provided ``pattern``.""" return self.filter(path__regex=posix_regex_to_django_regex_lookup(pattern)) - def path_patterns(self, patterns): + def path_patterns(self, patterns, ignore=False): """Resources with a path that match the provided ``pattern``.""" lookups = Q() for resource_pattern in patterns: lookups |= Q( **{"path__regex": posix_regex_to_django_regex_lookup(resource_pattern)} ) - return self.filter(~lookups) + if ignore: + return self.filter(~lookups) + else: + return self.filter(lookups) def 
has_directory_content_fingerprint(self): """ diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index 5a03a8712b..2ccdf814b3 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -24,6 +24,7 @@ from collections import defaultdict from contextlib import suppress from pathlib import Path +from re import match as regex_match from django.contrib.postgres.aggregates.general import ArrayAgg from django.core.exceptions import MultipleObjectsReturned @@ -43,6 +44,7 @@ from scanpipe import pipes from scanpipe.models import CodebaseRelation from scanpipe.models import CodebaseResource +from scanpipe.models import posix_regex_to_django_regex_lookup from scanpipe.pipes import LoopProgress from scanpipe.pipes import flag from scanpipe.pipes import get_resource_diff_ratio @@ -774,87 +776,129 @@ def _map_javascript_resource( resource.update(status=flag.MAPPED) -def _map_about_file_resource(project, about_file_resource, to_resources): - about_file_location = str(about_file_resource.location_path) - package_data = resolve.resolve_about_package(about_file_location) - - error_message_details = { - "path": about_file_resource.path, - "package_data": package_data, - } - if not package_data: - project.add_error( - description="Cannot create package from ABOUT file", - model="map_about_files", - details=error_message_details, - ) +def map_about_files(project, logger=None): + """Map ``from/`` .ABOUT files to their related ``to/`` resources.""" + project_resources = project.codebaseresources + from_about_files = ( + project_resources.files().from_codebase().filter(extension=".ABOUT") + ) + if not from_about_files.exists(): return - files_pattern = package_data.get("filename") - if not files_pattern: - # Cannot map anything without the about_resource value. 
- project.add_error( - description="ABOUT file does not have about_resource", - model="map_about_files", - details=error_message_details, - ) - return + to_resources = project_resources.to_codebase().no_status() - ignored_resource_patterns = [] - codebase_resources = to_resources.path_pattern(pattern=files_pattern) - if extra_data := package_data.get("extra_data"): - ignored_resource_patterns = extra_data.get("ignored_resources") - - # Fetch all resources that are covered by the .ABOUT file. - if not codebase_resources: - # If there's nothing to map on the ``to/`` do not create the package. - project.add_warning( - description=( - "Resource paths listed at about_resource is not found" - " in the to/ codebase" - ), - model="map_about_files", - details=error_message_details, + if logger: + logger( + f"Mapping {from_about_files.count():,d} .ABOUT files found in the from/ " + f"codebase." ) - return - # Ignore resources for paths in `ignored_resources` attribute - if ignored_resource_patterns: - codebase_resources = codebase_resources.path_patterns( - patterns=ignored_resource_patterns + regex_by_about_path = {} + ignore_regex_by_about_path = {} + about_resources_by_path = {} + about_pkgdata_by_path = {} + mapped_resources_by_aboutpath = {} + + for about_file_resource in from_about_files: + package_data = resolve.resolve_about_package( + input_location=str(about_file_resource.location_path) ) + error_message_details = { + "path": about_file_resource.path, + "package_data": package_data, + } + if not package_data: + project.add_error( + description="Cannot create package from ABOUT file", + model="map_about_files", + details=error_message_details, + ) + continue - # Create the Package using .ABOUT data and assigned related codebase_resources - pipes.update_or_create_package(project, package_data, codebase_resources) + about_pkgdata_by_path[about_file_resource.path] = package_data + files_pattern = package_data.get("filename") + if not files_pattern: + # Cannot map 
anything without the about_resource value. + project.add_error( + description="ABOUT file does not have about_resource", + model="map_about_files", + details=error_message_details, + ) + continue + else: + regex = posix_regex_to_django_regex_lookup(files_pattern) + regex_by_about_path[about_file_resource.path] = regex + + if extra_data := package_data.get("extra_data"): + ignore_regex = [] + for pattern in extra_data.get("ignored_resources", []): + ignore_regex.append(posix_regex_to_django_regex_lookup(pattern)) + if ignore_regex: + ignore_regex_by_about_path[about_file_resource.path] = ignore_regex + + about_resources_by_path[about_file_resource.path] = about_file_resource + mapped_resources_by_aboutpath[about_file_resource.path] = [] + + for to_resource in to_resources: + resource_matched = False + for about_path, regex_pattern in regex_by_about_path.items(): + if regex_match(pattern=regex_pattern, string=to_resource.path): + resource_matched = True + break + + if not resource_matched: + continue - # Map the .ABOUT file resource to all related resources in the ``to/`` side. 
- for to_resource in codebase_resources: - pipes.make_relation( - from_resource=about_file_resource, - to_resource=to_resource, - map_type="about_file", - ) + ignore_regex_patterns = ignore_regex_by_about_path.get(about_path, []) + ignore_resource = False + for ignore_regex_pattern in ignore_regex_patterns: + if regex_match(pattern=ignore_regex_pattern, string=to_resource.path): + ignore_resource = True + break - codebase_resources.update(status=flag.ABOUT_MAPPED) - about_file_resource.update(status=flag.ABOUT_MAPPED) + if ignore_resource: + continue + mapped_resources_about = mapped_resources_by_aboutpath.get(about_path) + if mapped_resources_about: + mapped_resources_about.append(to_resource) + else: + mapped_resources_by_aboutpath[about_path] = [to_resource] + to_resource.update(status=flag.ABOUT_MAPPED) + + for about_path, mapped_resources in mapped_resources_by_aboutpath.items(): + about_file_resource = about_resources_by_path[about_path] + package_data = about_pkgdata_by_path[about_file_resource.path] + + if not mapped_resources: + error_message_details = { + "path": about_file_resource.path, + "package_data": package_data, + } + project.add_warning( + description=( + "Resource paths listed at about_resource is not found" + " in the to/ codebase" + ), + model="map_about_files", + details=error_message_details, + ) + continue -def map_about_files(project, logger=None): - """Map ``from/`` .ABOUT files to their related ``to/`` resources.""" - project_resources = project.codebaseresources - from_files = project_resources.files().from_codebase() - from_about_files = from_files.filter(extension=".ABOUT") - to_resources = project_resources.to_codebase() + # Create the Package using .ABOUT data and assigned related codebase_resources + pipes.update_or_create_package(project, package_data, mapped_resources) - if logger: - logger( - f"Mapping {from_about_files.count():,d} .ABOUT files found in the from/ " - f"codebase." 
- ) + # Map the .ABOUT file resource to all related resources in the ``to/`` side. + for mapped_resource in mapped_resources: + pipes.make_relation( + from_resource=about_file_resource, + to_resource=mapped_resource, + map_type="about_file", + ) - for about_file_resource in from_about_files: - _map_about_file_resource(project, about_file_resource, to_resources) + about_file_resource.update(status=flag.ABOUT_MAPPED) + for about_file_resource in about_resources_by_path.values(): about_file_companions = ( about_file_resource.siblings() .filter(name__startswith=about_file_resource.name_without_extension) From a0c03b2ae5a3716e42a338c63935d9b76a13b31e Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Mon, 15 Jan 2024 15:39:02 +0530 Subject: [PATCH 3/8] Restructure map_about_files Signed-off-by: Ayan Sinha Mahapatra --- scanpipe/pipes/d2d.py | 253 +++++++++++++++++++++++------------------- 1 file changed, 141 insertions(+), 112 deletions(-) diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index d73f1f6dd9..8295a48dab 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -25,6 +25,7 @@ from contextlib import suppress from pathlib import Path from re import match as regex_match +from typing import NamedTuple from django.contrib.postgres.aggregates.general import ArrayAgg from django.core.exceptions import MultipleObjectsReturned @@ -776,6 +777,140 @@ def _map_javascript_resource( resource.update(status=flag.MAPPED) +class AboutFileIndexes(NamedTuple): + regex_by_about_path: dict + ignore_regex_by_about_path: dict + about_resources_by_path: dict + about_pkgdata_by_path: dict + mapped_resources_by_aboutpath: dict + + @classmethod + def create_indexes(cls, project, from_about_files): + """Return an ABOUT file index or None.""" + about_pkgdata_by_path = {} + regex_by_about_path = {} + ignore_regex_by_about_path = {} + about_resources_by_path = {} + mapped_resources_by_aboutpath = {} + + for about_file_resource in from_about_files: + package_data = 
resolve.resolve_about_package( + input_location=str(about_file_resource.location_path) + ) + error_message_details = { + "path": about_file_resource.path, + "package_data": package_data, + } + if not package_data: + project.add_error( + description="Cannot create package from ABOUT file", + model="map_about_files", + details=error_message_details, + ) + continue + + about_pkgdata_by_path[about_file_resource.path] = package_data + files_pattern = package_data.get("filename") + if not files_pattern: + # Cannot map anything without the about_resource value. + project.add_error( + description="ABOUT file does not have about_resource", + model="map_about_files", + details=error_message_details, + ) + continue + else: + regex = posix_regex_to_django_regex_lookup(files_pattern) + regex_by_about_path[about_file_resource.path] = regex + + if extra_data := package_data.get("extra_data"): + ignore_regex = [] + for pattern in extra_data.get("ignored_resources", []): + ignore_regex.append(posix_regex_to_django_regex_lookup(pattern)) + if ignore_regex: + ignore_regex_by_about_path[about_file_resource.path] = ignore_regex + + about_resources_by_path[about_file_resource.path] = about_file_resource + mapped_resources_by_aboutpath[about_file_resource.path] = [] + + return cls( + about_pkgdata_by_path=about_pkgdata_by_path, + regex_by_about_path=regex_by_about_path, + ignore_regex_by_about_path=ignore_regex_by_about_path, + about_resources_by_path=about_resources_by_path, + mapped_resources_by_aboutpath=mapped_resources_by_aboutpath, + ) + + def match_to_resources(self, to_resources): + for to_resource in to_resources: + resource_matched = False + for about_path, regex_pattern in self.regex_by_about_path.items(): + if regex_match(pattern=regex_pattern, string=to_resource.path): + resource_matched = True + break + + if not resource_matched: + continue + + ignore_regex_patterns = self.ignore_regex_by_about_path.get(about_path, []) + ignore_resource = False + for ignore_regex_pattern in 
ignore_regex_patterns: + if regex_match(pattern=ignore_regex_pattern, string=to_resource.path): + ignore_resource = True + break + + if ignore_resource: + continue + + mapped_resources_about = self.mapped_resources_by_aboutpath.get(about_path) + if mapped_resources_about: + mapped_resources_about.append(to_resource) + else: + self.mapped_resources_by_aboutpath[about_path] = [to_resource] + to_resource.update(status=flag.ABOUT_MAPPED) + + def create_about_packages_relations(self, project): + for about_path, mapped_resources in self.mapped_resources_by_aboutpath.items(): + about_file_resource = self.about_resources_by_path[about_path] + package_data = self.about_pkgdata_by_path[about_file_resource.path] + + if not mapped_resources: + error_message_details = { + "path": about_file_resource.path, + "package_data": package_data, + } + project.add_warning( + description=( + "Resource paths listed at about_resource is not found" + " in the to/ codebase" + ), + model="map_about_files", + details=error_message_details, + ) + continue + + # Create the Package using .ABOUT data and assign related codebase_resources + pipes.update_or_create_package(project, package_data, mapped_resources) + + # Map the .ABOUT file resource to all related resources in the ``to/`` side. 
+ for mapped_resource in mapped_resources: + pipes.make_relation( + from_resource=about_file_resource, + to_resource=mapped_resource, + map_type="about_file", + ) + + about_file_resource.update(status=flag.ABOUT_MAPPED) + + for about_file_resource in self.about_resources_by_path.values(): + about_file_companions = ( + about_file_resource.siblings() + .filter(name__startswith=about_file_resource.name_without_extension) + .filter(extension__in=[".LICENSE", ".NOTICE"]) + ) + about_file_companions.update(status=flag.ABOUT_MAPPED) + + def map_about_files(project, logger=None): """Map ``from/`` .ABOUT files to their related ``to/`` resources.""" project_resources = project.codebaseresources @@ -785,6 +920,10 @@ def map_about_files(project, logger=None): if not from_about_files.exists(): return + indexes = AboutFileIndexes.create_indexes( + project=project, from_about_files=from_about_files + ) + to_resources = project_resources.to_codebase().no_status() if logger: @@ -793,118 +932,8 @@ def map_about_files(project, logger=None): f"codebase." ) - regex_by_about_path = {} - ignore_regex_by_about_path = {} - about_resources_by_path = {} - about_pkgdata_by_path = {} - mapped_resources_by_aboutpath = {} - - for about_file_resource in from_about_files: - package_data = resolve.resolve_about_package( - input_location=str(about_file_resource.location_path) - ) - error_message_details = { - "path": about_file_resource.path, - "package_data": package_data, - } - if not package_data: - project.add_error( - description="Cannot create package from ABOUT file", - model="map_about_files", - details=error_message_details, - ) - continue - - about_pkgdata_by_path[about_file_resource.path] = package_data - files_pattern = package_data.get("filename") - if not files_pattern: - # Cannot map anything without the about_resource value. 
- project.add_error( - description="ABOUT file does not have about_resource", - model="map_about_files", - details=error_message_details, - ) - continue - else: - regex = posix_regex_to_django_regex_lookup(files_pattern) - regex_by_about_path[about_file_resource.path] = regex - - if extra_data := package_data.get("extra_data"): - ignore_regex = [] - for pattern in extra_data.get("ignored_resources", []): - ignore_regex.append(posix_regex_to_django_regex_lookup(pattern)) - if ignore_regex: - ignore_regex_by_about_path[about_file_resource.path] = ignore_regex - - about_resources_by_path[about_file_resource.path] = about_file_resource - mapped_resources_by_aboutpath[about_file_resource.path] = [] - - for to_resource in to_resources: - resource_matched = False - for about_path, regex_pattern in regex_by_about_path.items(): - if regex_match(pattern=regex_pattern, string=to_resource.path): - resource_matched = True - break - - if not resource_matched: - continue - - ignore_regex_patterns = ignore_regex_by_about_path.get(about_path, []) - ignore_resource = False - for ignore_regex_pattern in ignore_regex_patterns: - if regex_match(pattern=ignore_regex_pattern, string=to_resource.path): - ignore_resource = True - break - - if ignore_resource: - continue - - mapped_resources_about = mapped_resources_by_aboutpath.get(about_path) - if mapped_resources_about: - mapped_resources_about.append(to_resource) - else: - mapped_resources_by_aboutpath[about_path] = [to_resource] - to_resource.update(status=flag.ABOUT_MAPPED) - - for about_path, mapped_resources in mapped_resources_by_aboutpath.items(): - about_file_resource = about_resources_by_path[about_path] - package_data = about_pkgdata_by_path[about_file_resource.path] - - if not mapped_resources: - error_message_details = { - "path": about_file_resource.path, - "package_data": package_data, - } - project.add_warning( - description=( - "Resource paths listed at about_resource is not found" - " in the to/ codebase" - ), - 
model="map_about_files", - details=error_message_details, - ) - continue - - # Create the Package using .ABOUT data and assigned related codebase_resources - pipes.update_or_create_package(project, package_data, mapped_resources) - - # Map the .ABOUT file resource to all related resources in the ``to/`` side. - for mapped_resource in mapped_resources: - pipes.make_relation( - from_resource=about_file_resource, - to_resource=mapped_resource, - map_type="about_file", - ) - - about_file_resource.update(status=flag.ABOUT_MAPPED) - - for about_file_resource in about_resources_by_path.values(): - about_file_companions = ( - about_file_resource.siblings() - .filter(name__startswith=about_file_resource.name_without_extension) - .filter(extension__in=[".LICENSE", ".NOTICE"]) - ) - about_file_companions.update(status=flag.ABOUT_MAPPED) + indexes.match_to_resources(to_resources) + indexes.create_about_packages_relations(project) def map_javascript_post_purldb_match(project, logger=None): From b9e583d42d66b2ea8adb7ca4bb1cd714f936f68e Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Tue, 16 Jan 2024 17:42:06 +0530 Subject: [PATCH 4/8] Address feedback and review comments Reference: https://github.com/nexB/scancode.io/pull/982 Signed-off-by: Ayan Sinha Mahapatra --- scanpipe/models.py | 20 +---- scanpipe/pipes/d2d.py | 135 ++++++++++++++++++++++++++-------- scanpipe/tests/test_models.py | 6 +- 3 files changed, 113 insertions(+), 48 deletions(-) diff --git a/scanpipe/models.py b/scanpipe/models.py index 749cfdf6c9..55bfc52c9c 100644 --- a/scanpipe/models.py +++ b/scanpipe/models.py @@ -1793,12 +1793,12 @@ def profile(self, print_results=False): print(output_str) -def posix_regex_to_django_regex_lookup(regex_pattern): +def convert_glob_to_django_regex(glob_pattern): """ Convert a POSIX-style regex pattern to an equivalent pattern compatible with the Django regex lookup. 
""" - escaped_pattern = re.escape(regex_pattern) + escaped_pattern = re.escape(glob_pattern) escaped_pattern = escaped_pattern.replace(r"\*", ".*") # Replace \* with .* escaped_pattern = escaped_pattern.replace(r"\?", ".") # Replace \? with . escaped_pattern = f"^{escaped_pattern}$" # Add start and end anchors @@ -1906,20 +1906,8 @@ def has_value(self, field_name): return self.filter(~Q((f"{field_name}__in", EMPTY_VALUES))) def path_pattern(self, pattern): - """Resources with a path that match the provided ``pattern``.""" - return self.filter(path__regex=posix_regex_to_django_regex_lookup(pattern)) - - def path_patterns(self, patterns, ignore=False): - """Resources with a path that match the provided ``pattern``.""" - lookups = Q() - for resource_pattern in patterns: - lookups |= Q( - **{"path__regex": posix_regex_to_django_regex_lookup(resource_pattern)} - ) - if ignore: - return self.filter(~lookups) - else: - return self.filter(lookups) + """Resources with a path that match the provided glob ``pattern``.""" + return self.filter(path__regex=convert_glob_to_django_regex(pattern)) def has_directory_content_fingerprint(self): """ diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index 8295a48dab..c72512d00c 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -45,7 +45,7 @@ from scanpipe import pipes from scanpipe.models import CodebaseRelation from scanpipe.models import CodebaseResource -from scanpipe.models import posix_regex_to_django_regex_lookup +from scanpipe.models import convert_glob_to_django_regex from scanpipe.pipes import LoopProgress from scanpipe.pipes import flag from scanpipe.pipes import get_resource_diff_ratio @@ -778,21 +778,35 @@ def _map_javascript_resource( class AboutFileIndexes(NamedTuple): + # Mapping of About file paths and the regex pattern + # string for the files documented regex_by_about_path: dict + # Mapping of About file paths and a list of path pattern + # strings, for the files to be ignored 
ignore_regex_by_about_path: dict + # Resource objects for About files present in the codebase, + # by their path about_resources_by_path: dict + # mapping of package data present in the About file, by path about_pkgdata_by_path: dict + # List of mapped resources for each About file, by path mapped_resources_by_aboutpath: dict @classmethod - def create_indexes(cls, project, from_about_files): - """Return an ABOUT file index or None.""" + def create_indexes(cls, project, from_about_files, logger=None): + """ + Return an ABOUT file index, containing path pattern mappings, + package data, and resources, created from `from_about_files`, + the About file resources. + """ about_pkgdata_by_path = {} regex_by_about_path = {} ignore_regex_by_about_path = {} about_resources_by_path = {} mapped_resources_by_aboutpath = {} + count_indexed_about_files = 0 + for about_file_resource in from_about_files: package_data = resolve.resolve_about_package( input_location=str(about_file_resource.location_path) @@ -820,19 +834,26 @@ def create_indexes(cls, project, from_about_files): ) continue else: - regex = posix_regex_to_django_regex_lookup(files_pattern) + count_indexed_about_files += 1 + regex = convert_glob_to_django_regex(files_pattern) regex_by_about_path[about_file_resource.path] = regex if extra_data := package_data.get("extra_data"): ignore_regex = [] for pattern in extra_data.get("ignored_resources", []): - ignore_regex.append(posix_regex_to_django_regex_lookup(pattern)) + ignore_regex.append(convert_glob_to_django_regex(pattern)) if ignore_regex: ignore_regex_by_about_path[about_file_resource.path] = ignore_regex about_resources_by_path[about_file_resource.path] = about_file_resource mapped_resources_by_aboutpath[about_file_resource.path] = [] + if logger: + logger( + f"Created mapping index from {count_indexed_about_files:,d} .ABOUT " + f"files in the from/ codebase." 
+ ) + return cls( about_pkgdata_by_path=about_pkgdata_by_path, regex_by_about_path=regex_by_about_path, @@ -841,25 +862,46 @@ def create_indexes(cls, project, from_about_files): mapped_resources_by_aboutpath=mapped_resources_by_aboutpath, ) - def match_to_resources(self, to_resources): - for to_resource in to_resources: - resource_matched = False - for about_path, regex_pattern in self.regex_by_about_path.items(): - if regex_match(pattern=regex_pattern, string=to_resource.path): - resource_matched = True - break - - if not resource_matched: - continue - - ignore_regex_patterns = self.ignore_regex_by_about_path.get(about_path, []) - ignore_resource = False - for ignore_regex_pattern in ignore_regex_patterns: - if regex_match(pattern=ignore_regex_pattern, string=to_resource.path): - ignore_resource = True - break + def get_matched_about_path(self, to_resource): + """ + Map `to_resource` using the about file index, and if + mapped, return the path string to the About file it + was mapped to, and if not mapped or ignored, return + None. + """ + resource_mapped = False + for about_path, regex_pattern in self.regex_by_about_path.items(): + if regex_match(pattern=regex_pattern, string=to_resource.path): + resource_mapped = True + break + + if not resource_mapped: + return + + ignore_regex_patterns = self.ignore_regex_by_about_path.get(about_path, []) + ignore_resource = False + for ignore_regex_pattern in ignore_regex_patterns: + if regex_match(pattern=ignore_regex_pattern, string=to_resource.path): + ignore_resource = True + break + + if ignore_resource: + return + + return about_path + + def map_deployed_to_devel_using_about(self, to_resources): + """ + Return mapped resources which are mapped using the + path patterns in About file indexes. Resources are + mapped for each About file in the index, and + their status is updated accordingly. 
+ """ + mapped_to_resources = [] - if ignore_resource: + for to_resource in to_resources: + about_path = self.get_matched_about_path(to_resource) + if not about_path: continue mapped_resources_about = self.mapped_resources_by_aboutpath.get(about_path) @@ -867,9 +909,20 @@ def match_to_resources(self, to_resources): mapped_resources_about.append(to_resource) else: self.mapped_resources_by_aboutpath[about_path] = [to_resource] + mapped_to_resources.append(to_resource) to_resource.update(status=flag.ABOUT_MAPPED) + return mapped_to_resources + def create_about_packages_relations(self, project): + """ + Create packages using About file package data, if the About file + has mapped resources on the to/ codebase, and create the mappings + between the created package and the mapped resources. + """ + about_purls = set() + mapped_about_resources = [] + for about_path, mapped_resources in self.mapped_resources_by_aboutpath.items(): about_file_resource = self.about_resources_by_path[about_path] package_data = self.about_pkgdata_by_path[about_file_resource.path] @@ -890,7 +943,13 @@ def create_about_packages_relations(self, project): continue # Create the Package using .ABOUT data and assign related codebase_resources - pipes.update_or_create_package(project, package_data, mapped_resources) + about_package = pipes.update_or_create_package( + project=project, + package_data=package_data, + codebase_resources=mapped_resources, + ) + about_purls.add(about_package.purl) + mapped_about_resources.append(about_file_resource) # Map the .ABOUT file resource to all related resources in the ``to/`` side. 
for mapped_resource in mapped_resources: @@ -910,6 +969,8 @@ def create_about_packages_relations(self, project): ) about_file_companions.update(status=flag.ABOUT_MAPPED) + return about_purls, mapped_about_resources + def map_about_files(project, logger=None): """Map ``from/`` .ABOUT files to their related ``to/`` resources.""" @@ -920,20 +981,36 @@ def map_about_files(project, logger=None): if not from_about_files.exists(): return + if logger: + logger( + f"Mapping {from_about_files.count():,d} .ABOUT files found in the from/ " + f"codebase." + ) + indexes = AboutFileIndexes.create_indexes( project=project, from_about_files=from_about_files ) + # Ignoring empty or ignored files as they are not relevant anyway to_resources = project_resources.to_codebase().no_status() - + mapped_to_resources = indexes.map_deployed_to_devel_using_about( + to_resources=to_resources, + ) if logger: logger( - f"Mapping {from_about_files.count():,d} .ABOUT files found in the from/ " - f"codebase." + f"Mapped {len(mapped_to_resources):,d} resources from the " + f"to/ codebase to the About files in the from/ codebase." ) - indexes.match_to_resources(to_resources) - indexes.create_about_packages_relations(project) + about_purls, mapped_about_resources = indexes.create_about_packages_relations( + project=project, + ) + if logger: + logger( + f"Created {len(about_purls):,d} new packages from " + f"{len(mapped_about_resources):,d} About files which " + f"were mapped to resources in the to/ side." 
+ ) def map_javascript_post_purldb_match(project, logger=None): diff --git a/scanpipe/tests/test_models.py b/scanpipe/tests/test_models.py index 2b81f06270..b2358acb46 100644 --- a/scanpipe/tests/test_models.py +++ b/scanpipe/tests/test_models.py @@ -61,8 +61,8 @@ from scanpipe.models import RunInProgressError from scanpipe.models import RunNotAllowedToStart from scanpipe.models import UUIDTaggedItem +from scanpipe.models import convert_glob_to_django_regex from scanpipe.models import get_project_work_directory -from scanpipe.models import posix_regex_to_django_regex_lookup from scanpipe.pipes.fetch import Download from scanpipe.pipes.input import copy_input from scanpipe.tests import dependency_data1 @@ -683,7 +683,7 @@ def test_scanpipe_model_update_mixin(self): package.refresh_from_db() self.assertEqual("pkg:deb/debian/adduser@3.118?arch=all", package.package_url) - def test_scanpipe_model_posix_regex_to_django_regex_lookup(self): + def test_scanpipe_model_convert_glob_to_django_regex(self): test_data = [ ("", r"^$"), # Single segment @@ -715,7 +715,7 @@ def test_scanpipe_model_posix_regex_to_django_regex_lookup(self): ] for pattern, expected in test_data: - self.assertEqual(expected, posix_regex_to_django_regex_lookup(pattern)) + self.assertEqual(expected, convert_glob_to_django_regex(pattern)) def test_scanpipe_run_model_set_scancodeio_version(self): run1 = Run.objects.create(project=self.project1) From b1393598d1ee01baaeceda7136d58956a7e182f0 Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 17 Jan 2024 01:55:17 +0530 Subject: [PATCH 5/8] Update docstrings and use dataclass Signed-off-by: Ayan Sinha Mahapatra --- scanpipe/models.py | 4 ++-- scanpipe/pipes/d2d.py | 11 +++++++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/scanpipe/models.py b/scanpipe/models.py index 55bfc52c9c..8ee6970beb 100644 --- a/scanpipe/models.py +++ b/scanpipe/models.py @@ -1795,8 +1795,8 @@ def profile(self, print_results=False): def 
convert_glob_to_django_regex(glob_pattern): """ - Convert a POSIX-style regex pattern to an equivalent pattern compatible with the - Django regex lookup. + Convert a glob pattern to an equivalent django regex pattern + compatible with the Django regex lookup. """ escaped_pattern = re.escape(glob_pattern) escaped_pattern = escaped_pattern.replace(r"\*", ".*") # Replace \* with .* diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index c72512d00c..ca54b60fdb 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -23,9 +23,9 @@ from collections import Counter from collections import defaultdict from contextlib import suppress +from dataclasses import dataclass from pathlib import Path from re import match as regex_match -from typing import NamedTuple from django.contrib.postgres.aggregates.general import ArrayAgg from django.core.exceptions import MultipleObjectsReturned @@ -777,7 +777,14 @@ def _map_javascript_resource( resource.update(status=flag.MAPPED) -class AboutFileIndexes(NamedTuple): +@dataclass +class AboutFileIndexes: + """ + Stores the regex path patterns, and ignore patterns, + package data and about file/mapped resources to create + packages from About files and map to deployed resources. 
+ """ + # Mapping of About file paths and the regex pattern # string for the files documented regex_by_about_path: dict From ae025a2b06f24f523ca0da52082dde45095489aa Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Wed, 17 Jan 2024 02:42:35 +0530 Subject: [PATCH 6/8] Use license/notice files from About data Reference: https://github.com/nexB/scancode.io/issues/1004 Signed-off-by: Ayan Sinha Mahapatra --- scanpipe/pipes/d2d.py | 31 ++++++++++++++----- scanpipe/pipes/resolve.py | 28 ++++++++++++----- .../tests/data/d2d/about_files/expected.json | 4 ++- scanpipe/tests/pipes/test_resolve.py | 6 +++- 4 files changed, 52 insertions(+), 17 deletions(-) diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index ca54b60fdb..b9874703ac 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -921,6 +921,26 @@ def map_deployed_to_devel_using_about(self, to_resources): return mapped_to_resources + def get_about_file_companions(self, about_path): + """ + Given an ``about_path`` path string to an About file, + get CodebaseResource objects for the companion license + and notice files. 
+ """ + about_file_resource = self.about_resources_by_path.get(about_path) + about_file_extra_data = self.about_pkgdata_by_path.get(about_path).get( + "extra_data" + ) + + about_file_companion_names = [ + about_file_extra_data.get("license_file"), + about_file_extra_data.get("notice_file"), + ] + about_file_companions = about_file_resource.siblings().filter( + name__in=about_file_companion_names + ) + return about_file_companions + def create_about_packages_relations(self, project): """ Create packages using About file package data, if the About file @@ -932,11 +952,11 @@ def create_about_packages_relations(self, project): for about_path, mapped_resources in self.mapped_resources_by_aboutpath.items(): about_file_resource = self.about_resources_by_path[about_path] - package_data = self.about_pkgdata_by_path[about_file_resource.path] + package_data = self.about_pkgdata_by_path[about_path] if not mapped_resources: error_message_details = { - "path": about_file_resource.path, + "path": about_path, "package_data": package_data, } project.add_warning( @@ -968,12 +988,7 @@ def create_about_packages_relations(self, project): about_file_resource.update(status=flag.ABOUT_MAPPED) - for about_file_resource in self.about_resources_by_path.values(): - about_file_companions = ( - about_file_resource.siblings() - .filter(name__startswith=about_file_resource.name_without_extension) - .filter(extension__in=[".LICENSE", ".NOTICE"]) - ) + about_file_companions = self.get_about_file_companions(about_path) about_file_companions.update(status=flag.ABOUT_MAPPED) return about_purls, mapped_about_resources diff --git a/scanpipe/pipes/resolve.py b/scanpipe/pipes/resolve.py index 25b343d9b8..7f99afadb5 100644 --- a/scanpipe/pipes/resolve.py +++ b/scanpipe/pipes/resolve.py @@ -86,18 +86,15 @@ def resolve_about_package(input_location): if value: package_data[field_name] = value + package_data["extra_data"] = {} + if about_resource := about_data.get("about_resource"): package_data["filename"] = 
list(about_resource.keys())[0] if ignored_resources := about_data.get("ignored_resources"): - extra_data = {"ignored_resources": list(ignored_resources.keys())} - package_data["extra_data"] = extra_data - - if license_expression := about_data.get("license_expression"): - package_data["declared_license_expression"] = license_expression + package_data["extra_data"]["ignored_resources"] = list(ignored_resources.keys()) - if notice_dict := about_data.get("notice_file"): - package_data["notice_text"] = list(notice_dict.values())[0] + populate_license_notice_fields_about(package_data, about_data) for field_name, value in about_data.items(): if field_name.startswith("checksum_"): @@ -107,6 +104,23 @@ def resolve_about_package(input_location): return package_data +def populate_license_notice_fields_about(package_data, about_data): + """ + Populate ``package_data`` with license and notice attributes + from ``about_data``. + """ + if license_expression := about_data.get("license_expression"): + package_data["declared_license_expression"] = license_expression + + if notice_dict := about_data.get("notice_file"): + package_data["notice_text"] = list(notice_dict.values())[0] + package_data["extra_data"]["notice_file"] = list(notice_dict.keys())[0] + + if license_dict := about_data.get("license_file"): + package_data["extra_data"]["license_file"] = list(license_dict.keys())[0] + package_data["extracted_license_statement"] = list(license_dict.values())[0] + + def resolve_about_packages(input_location): """ Wrap ``resolve_about_package`` to return a list as expected by the diff --git a/scanpipe/tests/data/d2d/about_files/expected.json b/scanpipe/tests/data/d2d/about_files/expected.json index f8ebf3db29..8c586170d6 100644 --- a/scanpipe/tests/data/d2d/about_files/expected.json +++ b/scanpipe/tests/data/d2d/about_files/expected.json @@ -111,10 +111,12 @@ "other_license_expression": "", "other_license_expression_spdx": "", "other_license_detections": [], - 
"extracted_license_statement": "", + "extracted_license_statement": " Apache License\n Version 2.0, January 2004\n http://www.apache.org/licenses/\n\n ", "notice_text": "notice", "source_packages": [], "extra_data": { + "notice_file": "flume-ng-node-1.9.0-sources.NOTICE", + "license_file": "flume-ng-node-1.9.0-sources.LICENSE", "ignored_resources": [ "*flume-ng-node-*.jar-extract/org/apache/flume/node/ConfigurationProvider.class" ] diff --git a/scanpipe/tests/pipes/test_resolve.py b/scanpipe/tests/pipes/test_resolve.py index afe29caab6..f8802cd4d1 100644 --- a/scanpipe/tests/pipes/test_resolve.py +++ b/scanpipe/tests/pipes/test_resolve.py @@ -92,6 +92,8 @@ def test_scanpipe_pipes_resolve_resolve_packages(self): "filename": "Django-4.0.8-py3-none-any.whl", "download_url": "https://python.org/Django-4.0.8-py3-none-any.whl", "declared_license_expression": "bsd-new", + "extra_data": {"license_file": "bsd-new.LICENSE"}, + "extracted_license_statement": None, "md5": "386349753c386e574dceca5067e2788a", "name": "django", "sha1": "4cc6f7abda928a0b12cd1f1cd8ad3677519ca04e", @@ -114,6 +116,8 @@ def test_scanpipe_pipes_resolve_resolve_about_packages(self): "filename": "Django-4.0.8-py3-none-any.whl", "download_url": "https://python.org/Django-4.0.8-py3-none-any.whl", "declared_license_expression": "bsd-new", + "extra_data": {"license_file": "bsd-new.LICENSE"}, + "extracted_license_statement": None, "md5": "386349753c386e574dceca5067e2788a", "name": "django", "sha1": "4cc6f7abda928a0b12cd1f1cd8ad3677519ca04e", @@ -124,7 +128,7 @@ def test_scanpipe_pipes_resolve_resolve_about_packages(self): input_location = self.manifest_location / "poor_values.ABOUT" package = resolve.resolve_about_packages(str(input_location)) - expected = {"name": "project"} + expected = {"extra_data": {}, "name": "project"} self.assertEqual([expected], package) def test_scanpipe_pipes_resolve_spdx_package_to_discovered_package_data(self): From 4268ee82f1641de8f8060165013a72aef38ecc5f Mon Sep 17 00:00:00 
2001 From: Ayan Sinha Mahapatra Date: Fri, 26 Jan 2024 02:50:33 +0530 Subject: [PATCH 7/8] Add tests for AboutFileIndex methods Reference: https://github.com/nexB/scancode.io/pull/982 Signed-off-by: Ayan Sinha Mahapatra --- scanpipe/tests/pipes/test_d2d.py | 121 +++++++++++++++++++++++++++++++ 1 file changed, 121 insertions(+) diff --git a/scanpipe/tests/pipes/test_d2d.py b/scanpipe/tests/pipes/test_d2d.py index 2ddcdc720f..d3a1fecfa9 100644 --- a/scanpipe/tests/pipes/test_d2d.py +++ b/scanpipe/tests/pipes/test_d2d.py @@ -33,6 +33,7 @@ from scanpipe.models import Project from scanpipe.pipes import d2d from scanpipe.pipes import flag +from scanpipe.pipes import scancode from scanpipe.pipes.input import copy_input from scanpipe.pipes.input import copy_inputs from scanpipe.tests import make_resource_directory @@ -1258,6 +1259,126 @@ def test_scanpipe_pipes_flag_whitespace_files(self): flag.IGNORED_WHITESPACE_FILE, non_whitespace_resource.status ) + def test_scanpipe_pipes_create_about_file_indexes(self): + input_dir = self.project1.input_path + input_resources = [ + self.data_location / "d2d/about_files/to-with-jar.zip", + self.data_location / "d2d/about_files/from-with-about-file.zip", + ] + copy_inputs(input_resources, input_dir) + self.from_files, self.to_files = d2d.get_inputs(self.project1) + + inputs_with_codebase_path_destination = [ + (self.from_files, self.project1.codebase_path / d2d.FROM), + (self.to_files, self.project1.codebase_path / d2d.TO), + ] + + for input_files, codebase_path in inputs_with_codebase_path_destination: + for input_file_path in input_files: + scancode.extract_archive(input_file_path, codebase_path) + + scancode.extract_archives( + self.project1.codebase_path, + recurse=True, + ) + + pipes.collect_and_create_codebase_resources(self.project1) + + from_about_files = ( + self.project1.codebaseresources.files() + .from_codebase() + .filter(extension=".ABOUT") + ) + about_file_indexes = d2d.AboutFileIndexes.create_indexes( + 
project=self.project1, + from_about_files=from_about_files, + ) + + about_path = "from/flume-ng-node-1.9.0-sources.ABOUT" + about_notice_path = "from/flume-ng-node-1.9.0-sources.NOTICE" + + about_notice_file = self.project1.codebaseresources.get(path=about_notice_path) + + self.assertIn( + about_path, list(about_file_indexes.about_resources_by_path.keys()) + ) + about_regex = d2d.convert_glob_to_django_regex( + glob_pattern="*flume-ng-node-*.jar*" + ) + self.assertEqual( + about_file_indexes.regex_by_about_path.get(about_path), about_regex + ) + self.assertEqual( + about_file_indexes.about_pkgdata_by_path.get(about_path).get("name"), + "log4j", + ) + self.assertIn( + about_notice_file, about_file_indexes.get_about_file_companions(about_path) + ) + to_resource = self.project1.codebaseresources.get( + path=( + "to/flume-ng-node-1.9.0.jar-extract/org/apache/" + "flume/node/AbstractZooKeeperConfigurationProvider.class" + ) + ) + self.assertEqual( + about_file_indexes.get_matched_about_path(to_resource), about_path + ) + + def test_scanpipe_pipes_map_d2d_using_about(self): + input_dir = self.project1.input_path + input_resources = [ + self.data_location / "d2d/about_files/to-with-jar.zip", + self.data_location / "d2d/about_files/from-with-about-file.zip", + ] + copy_inputs(input_resources, input_dir) + self.from_files, self.to_files = d2d.get_inputs(self.project1) + + inputs_with_codebase_path_destination = [ + (self.from_files, self.project1.codebase_path / d2d.FROM), + (self.to_files, self.project1.codebase_path / d2d.TO), + ] + + for input_files, codebase_path in inputs_with_codebase_path_destination: + for input_file_path in input_files: + scancode.extract_archive(input_file_path, codebase_path) + + scancode.extract_archives( + self.project1.codebase_path, + recurse=True, + ) + + pipes.collect_and_create_codebase_resources(self.project1) + + from_about_files = ( + self.project1.codebaseresources.files() + .from_codebase() + .filter(extension=".ABOUT") + ) + 
about_file_indexes = d2d.AboutFileIndexes.create_indexes( + project=self.project1, + from_about_files=from_about_files, + ) + + to_resources = self.project1.codebaseresources.to_codebase() + about_file_indexes.map_deployed_to_devel_using_about( + to_resources=to_resources, + ) + + about_path = "from/flume-ng-node-1.9.0-sources.ABOUT" + to_resource = self.project1.codebaseresources.get( + path=( + "to/flume-ng-node-1.9.0.jar-extract/org/apache/" + "flume/node/AbstractZooKeeperConfigurationProvider.class" + ) + ) + self.assertIn( + to_resource, + about_file_indexes.mapped_resources_by_aboutpath.get(about_path), + ) + + about_file_indexes.create_about_packages_relations(self.project1) + def test_scanpipe_pipes_d2d_match_purldb_resources_post_process(self): to_map = self.data_location / "d2d-javascript" / "to" / "main.js.map" to_mini = self.data_location / "d2d-javascript" / "to" / "main.js" From 070387e74b7db9f1de0b8f0a97ac587b8365249f Mon Sep 17 00:00:00 2001 From: Ayan Sinha Mahapatra Date: Fri, 26 Jan 2024 02:55:35 +0530 Subject: [PATCH 8/8] Address feedback and update CHANGELOG Reference: https://github.com/nexB/scancode.io/pull/982 Signed-off-by: Ayan Sinha Mahapatra --- CHANGELOG.rst | 4 ++++ scanpipe/pipes/d2d.py | 7 ++++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 761e1f787d..2558021309 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -34,6 +34,10 @@ v33.0.0 (unreleased) project pipeline. https://github.com/nexB/scancode.io/issues/997 +- In "map_deploy_to_develop" pipeline, add support for path patterns + in About file attributes documenting resource paths. 
+ https://github.com/nexB/scancode.io/issues/1004 + v32.7.0 (2023-10-25) -------------------- diff --git a/scanpipe/pipes/d2d.py b/scanpipe/pipes/d2d.py index b9874703ac..eea8a9c9fb 100644 --- a/scanpipe/pipes/d2d.py +++ b/scanpipe/pipes/d2d.py @@ -780,9 +780,10 @@ def _map_javascript_resource( @dataclass class AboutFileIndexes: """ - Stores the regex path patterns, and ignore patterns, - package data and about file/mapped resources to create - packages from About files and map to deployed resources. + About file indexes are used to create packages from + About files and map the resources described in them + to the respective packages created, using regex path + patterns and other About file data. """ # Mapping of About file paths and the regex pattern