Merge pull request #49 from Giskard-AI/GSK-2864-fragment-markdown-report

Inokinoki · web-flow · commit 077e23c627f9 · 2024-02-23T18:08:56.000+01:00
Fragment issues and examples if they are too long
diff --git a/giskard_cicd/automation/post_discussion.py b/giskard_cicd/automation/post_discussion.py
@@ -3,8 +3,9 @@
 import markdown
 import re
 from time import sleep
-from .utils import ISSUE_GROUPS
+import logging
 
+logger = logging.getLogger(__file__)  
 GISKARD_HUB_URL = "https://huggingface.co/spaces/giskardai/giskard"
 
 def construct_opening(dataset_id, dataset_config, dataset_split, vulnerability_count):
@@ -73,37 +74,65 @@ def save_post(report_path, path, dataset_id, dataset_config, dataset_split):
     with open(path, "w") as f:
         f.write(post)
 
-
-def separate_report_by_issues(report):
-    # TODO: add markdown comments to the report as a split marker
-    regex = (
-        "\W(?="
-        + "|".join(["<details>\n<summary>👉" + issue for issue in ISSUE_GROUPS])
-        + ")"
-    )
-    sub_reports = re.split(regex, report)
-    return sub_reports
-
+class Issue:
+    def __init__(self, description, examples):
+        self.description = description
+        self.examples = examples
+    
+    def __len__(self):
+        return len(self.description) + len(self.examples)
+    
+    def trim_examples(self):
+        # Replace the examples with a placeholder when they exceed 60000 characters
+        if len(self.examples) > 60000:
+            self.examples = "examples are too long to be displayed"
+
+def load_report_to_issues(report):
+    splited_issues = []
+    # <!-- issue --> is used to separate the issues
+    issues = [ issue for issue in report.split("<!-- issue -->") if len(issue) > 0 ]
+    # <!-- examples --> marks the start of each examples section within an issue
+    for issue in issues:
+      descriptions = []
+      examples = []
+      splited_issue = issue.split("<!-- examples -->")
+      descriptions.append(splited_issue[0])
+      for sub_issue in splited_issue[1:]:
+          res = sub_issue.split("</details>")
+          for i in range(0, len(res), 2):
+              if len(res[i]) == 0 or len(set(res[i])) < 10:
+                  continue
+              examples.append(res[i])
+              if i + 1 < len(res):
+                  descriptions.append(res[i + 1])
+      splited_issues.extend([Issue(description, example) for description, example in zip(descriptions, examples)])
+    return splited_issues
 
 def post_issue_as_comment(discussion, issue, token, repo_id):
-    comment = hf_hub.comment_discussion(
-        repo_id=repo_id,
-        repo_type="space",
-        discussion_num=discussion.num,
-        comment=issue,
-        token=token,
-    )
-    return comment
+    try:
+      comment = issue
+      if isinstance(issue, Issue):
+          comment = issue.description + issue.examples
+      hf_hub.comment_discussion(
+          repo_id=repo_id,
+          repo_type="space",
+          discussion_num=discussion.num,
+          comment=comment,
+          token=token,
+      )
+    except Exception as e:
+        logger.debug(f"Failed to post issue as comment: {e}")
 
 
 def post_too_long_report_in_comments(
     discussion, report, token, repo_id, test_suite_url=None
 ):
-    sub_reports = separate_report_by_issues(report)
-
-    for issue in sub_reports:
+    issues = load_report_to_issues(report)
+    for issue in issues:
+        if len(issue) > 60000:
+            issue.trim_examples()
         post_issue_as_comment(discussion, issue, token, repo_id)
-
+        sleep(1)
     post_issue_as_comment(discussion, construct_closing(test_suite_url), token, repo_id)
     return discussion
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -11,7 +11,7 @@ version = "0.2.0"
 readme = "README.md"
 dependencies = [
     "datasets",
-    "giskard >= 2.3.0",
+    "giskard >= 2.7.3",
     "huggingface_hub",
     "torch",
     "transformers",