Feedback Updates

Bridget Allen · Bridget Allen · commit dff972c333c6 · 2025-07-23T17:52:01.000Z
- Adheres to PEP 8
- Imports are always placed at the top of the file
- Line lengths are appropriate
- Remove references to ExampleReadsApp
- Remove commented-out lines
diff --git a/lib/kb_bedtools/kb_bedtoolsImpl.py b/lib/kb_bedtools/kb_bedtoolsImpl.py
@@ -1,16 +1,16 @@
 # -*- coding: utf-8 -*-
 #BEGIN_HEADER
-import json
 import logging
 import os
 import subprocess
 
 from installed_clients.DataFileUtilClient import DataFileUtil
 from installed_clients.KBaseReportClient import KBaseReport
 from installed_clients.ReadsUtilsClient import ReadsUtils
-from .utils import ExampleReadsApp, BamConversion, Intersection
 from base import Core
 
+from kb_bedtools.utils import Intersection
+from kb_bedtools.utils import BamConversion
 
 #END_HEADER
 
@@ -51,7 +51,6 @@ def __init__(self, config):
 
 
     def run_kb_bedtools(self, ctx, params):
-        import subprocess
         version = subprocess.check_output(["bedtools", "--version"])
         print("BEDTOOLS VERSION IN CONTAINER:", version.decode())
 
@@ -75,21 +74,7 @@ def run_kb_bedtools(self, ctx, params):
             ),
         )
         bam = BamConversion(ctx, config=config, app_config=self.config)
-        #bam.bam_to_fastq(params['bam_file'], config['shared_folder'])
         output = bam.do_analysis(params)
-        #fastq_path = bam.bam_to_fastq(params['bam_file'])        #ExampleReadsApp.upload_reads(self, params['name'], params['reads_path'], params['wsname']) 
-        #era = ExampleReadsApp(ctx, config=config)
-        #era.upload_reads(params["bam_file"], params["read_ref"], params["workspace_name"])
-    
-        #out_path = os.path.join(self.shared_folder, 'filename_end1')
-        #logging.warning(f">>>>>>>>>>>>>>>>>>>>{fastq_path}")
-        # bam.upload_reads(params['output_name'], fastq_path, params['workspace_name']) 
-
-        #ExampleReadsApp.upload_reads(self, params['name'], params['reads_path'], params['wsname']) #might not need this
-        # Download Reads
-
-        #era = ExampleReadsApp(ctx, config=config)
-        #output = era.do_analysis(params)
 
         output = bam.do_analysis(params)
 
@@ -98,7 +83,6 @@ def run_kb_bedtools(self, ctx, params):
             raise ValueError('Method run_kb_bedtools return value ' +
                              'output is not type dict as required.')
         # return the results
-        print("RETURNING:", output)  # Must print this
         return [output]
         #END run_kb_bedtools
     def run_kb_bedtools_intersect(self, ctx, params):
diff --git a/lib/kb_bedtools/utils.py b/lib/kb_bedtools/utils.py
@@ -1,219 +1,46 @@
-"""
-This ExampleReadsApp demonstrates how to use best practices for KBase App
-development using the SFA base package.
-"""
 import json
 import io
 import logging
 import os
 import subprocess
-import uuid
 
 from collections import Counter
 from shutil import copyfile
 
-import pandas as pd
 import subprocess
 
 from Bio import SeqIO
 
-# This is the SFA base package which provides the Core app class.
 from base import Core
 
 MODULE_DIR = "/kb/module"
 TEMPLATES_DIR = os.path.join(MODULE_DIR, "lib/templates")
 
 
-class ExampleReadsApp(Core):
-    def __init__(self, ctx, config, clients_class=None):
-        """
-        This is required to instantiate the Core App class with its defaults
-        and allows you to pass in more clients as needed.
-        """
-        super().__init__(ctx, config, clients_class)
-        # Here we adjust the instance attributes for our convenience.
-        self.report = self.clients.KBaseReport
-        self.ru = self.clients.ReadsUtils
-        # self.shared_folder is defined in the Core App class.
-        # TODO Add a self.wsid = a conversion of self.wsname
-
-    #def do_analysis(self, params: dict):
-    #    """
-    #    This method is where the main computation will occur.
-    #    """
-    #    read_refs = params["reads_ref"]
-    #    # Download the reads from KBase
-    #    ret = self.download_reads(read_refs)
-    #    # We use these downloaded reads and biopython to collect the first 10
-    #    # reads and their phred quality scores to create a new fastq file to
-    #    # upload to KBase.
-    #    for file_ref, file_info in ret["files"].items():
-    #        file_path = file_info["files"]["fwd"]
-    #        basename = os.path.basename(file_path)
-    #        with open(file_path) as reads:
-    #            record_iter = SeqIO.parse(reads, "fastq")
-    #            limit = 10
-    #            head = []
-    #            scores = []
-    #            counts = Counter()
-    #            for ix, record in enumerate(record_iter):
-    #                if ix >= limit:
-    #                    break
-    #                head.append(record)
-    #                counts.update(str(record.seq))
-    #                scores.append(record.letter_annotations["phred_quality"])
-    #            filename = f"{basename}.head.fastq"
-    #            out_path = os.path.join(self.shared_folder, filename)
-    #            with open(out_path, "w") as out_reads:
-    #                SeqIO.write(head, out_reads, "fastq")
-#
-    #    # This method runs the process first and then returns the stdout and
-    #    # stderr all at once, so take care if your process produces a large
-    #    # amount of output.
-    #    process = subprocess.Popen(
-    #        ["/kb/module/scripts/random_logger.py"],
-    #        stdout=subprocess.PIPE,
-    #        stderr=subprocess.PIPE,
-    #    )
-#
-    #    stdout, stderr = self.get_streams(process)
-    #    # We are logging everything because the script we are running does not
-    #    # have a lot of output, but if what you run does then you might not
-    #    # want to log *everything* to the user.
-    #    logging.info(stdout)
-    #    if stderr:
-    #        logging.warning(stderr)
-    #    output_value = stdout.split("\n")[0].split(" ")[-2]
-    #    count_df = pd.DataFrame(sorted(counts.items()), columns=["base", "count"])
-#
-    #    # Upload the first 10 reads back to kbase as an object
-    #    upa = self.upload_reads(
-    #        name=params["output_name"], reads_path=out_path, wsname=params["workspace_name"]
-    #    )
-#
-    #    # Pass new data to generate the report.
-    #    params["count_df"] = count_df
-    #    params["output_value"] = output_value
-    #    params["scores"] = scores
-    #    params["upa"] = upa  # Not currently used, but the ID of the uploaded reads
-    #    # This is the method that generates the HTML report
-    #    return self.generate_report(params)
-#
-    @staticmethod
-    def get_streams(process):
-        """
-        Returns decoded stdout,stderr after loading the entire thing into memory
-        """
-        stdout, stderr = process.communicate()
-        return (stdout.decode("utf-8", "ignore"), stderr.decode("utf-8", "ignore"))
-
-    def upload_reads(self, name, reads_path, wsname):
-        """
-        Upload reads back to the KBase Workspace. This method only uses the
-        minimal parameters necessary to provide a demonstration. There are many
-        more parameters which reads can provide, for example, interleaved, etc.
-        By default, non-interleaved objects and those uploaded without a
-        reverse file are saved as KBaseFile.SingleEndLibrary. See:
-        https://githusb.com/kbaseapps/ReadsUtils/blob/master/lib/ReadsUtils/ReadsUtilsImpl.py#L115-L119
-        param: filepath_to_reads - A filepath to a fastq fastq file to upload reads from
-        param: wsname - The name of the workspace to upload to
-        """
-        ur_params = {
-            "fwd_file": reads_path,
-            "name": name,
-            "sequencing_tech": "Illumina",
-            "wsname": wsname,
-            "single_genome": 0,
-        }
-        # It is often useful to log parameters as they are passed.
-        logging.warning(f">>>>>>>>>>>>>>>>>>>>{ur_params}")
-        return self.ru.upload_reads(ur_params)
-
-    def download_reads(self, reads_ref, interleaved=False):
-        """
-        Download a list of reads objects
-        param: reads_ref - A list of reads references/upas
-        """
-        dr_params = {"read_libraries": [reads_ref], "interleaved": None}
-        # This uses the ReadsUtils client to download a specific workspace
-        # object, saving it into the shared_folder and making it available to
-        # the user.
-        return self.ru.download_reads(dr_params)
-
-    def generate_report(self, params: dict):
-        """
-        This method is where to define the variables to pass to the report.
-        """
-        # This path is required to properly use the template.
-        reports_path = os.path.join(self.shared_folder, "reports")
-        # Path to the Jinja template. The template can be adjusted to change
-        # the report.
-        template_path = os.path.join(TEMPLATES_DIR, "report.html")
-        # A sample multiplication table to use as output
-        table = [[i * j for j in range(10)] for i in range(10)]
-        headers = "one two three four five six seven eight nine ten".split(" ")
-        # A count of the base calls in the reads
-        count_df_html = params["count_df"].to_html()
-        # Calculate a correlation table determined by the quality scores of
-        # each base read. This requires pandas and matplotlib, and these are
-        # listed in requirements.txt. You can see the resulting HTML file after
-        # runing kb-sdk test in ./test_local/workdir/tmp/reports/index.html
-        scores_df_html = (
-            pd.DataFrame(params["scores"]).corr().style.background_gradient().render()
-        )
-        # The keys in this dictionary will be available as variables in the
-        # Jinja template. With the current configuration of the template
-        # engine, HTML output is allowed.
-        template_variables = dict(
-            count_df_html=count_df_html,
-            headers=headers,
-            scores_df_html=scores_df_html,
-            table=table,
-            upa=params["upa"],
-            output_value=params["output_value"],
-        )
-        # The KBaseReport configuration dictionary
-        config = dict(
-            report_name=f"ExampleReadsApp_{str(uuid.uuid4())}",
-            reports_path=reports_path,
-            template_variables=template_variables,
-            workspace_name=params["workspace_name"],
-        )
-        return self.create_report_from_template(template_path, config)
-
 class BamConversion(Core):
     def __init__(self, ctx, config, app_config, clients_class=None):
         """
         This is required to instantiate the Core App class with its defaults
         and allows you to pass in more clients as needed.
         """
         super().__init__(ctx, config, clients_class)
-        # Here we adjust the instance attributes for our convenience.
         self.dfu = self.clients.DataFileUtil
         self.report = self.clients.KBaseReport
         self.ru = self.clients.ReadsUtils
         self.app_config = app_config
-        # self.shared_folder is defined in the Core App class.
-        # TODO Add a self.wsid = a conversion of self.wsname
 
     def do_analysis(self, params: dict):
         """
         This method is where the main computation will occur.
         """
         print(f"{json.dumps(params)=}")
         bam_file = params['bam_file']
-        staging_path = bam_file if os.path.isfile(bam_file) else os.path.join("/staging/", bam_file)
-        # Read and print first 1000 characters
-
-
+        if os.path.isfile(bam_file):
+            staging_path = bam_file
+        else:
+            staging_path = os.path.join("/staging/", bam_file)
         
-        logging.warning(f"{'@'*30} params: {params}")
         logging.warning(f"cwd: {os.getcwd()}")
-        #bam_file_staging_path = self.dfu.download_staging_file({
-            # 'staging_file_subdir_path': bam_file
-        #}).get('copy_file_path')
-        #logging.warning(f'{"&"*20}{bam_file_staging_path=}')
-        #logging.warning(f"bam_file_staging_path: {bam_file_staging_path}")
         output_name = params['output_name']
         wsname = params['workspace_name']
         sequencing_tech = 'Illumina'
@@ -256,23 +83,9 @@ def bam_to_fastq(cls, bam_file, shared_folder=""): # add a dict parameter so tho
         if os.path.getsize("filename_end1.fq") < 100:
             raise ValueError("Generated FASTQ file is unexpectedly small — check input BAM or bedtools error")
 
-        with open("filename_end1.fq", 'rb') as f:
-            content = f.read(1001)
-            print("First 1001 characters from the file:")
-            decoded = "".join([c if ord(c)>=32 else "?" for c in content.decode("ascii", "ignore")])
-            print(f"{decoded=}")
-
         output_path = os.path.join(shared_folder, 'output.fq')
         copyfile('filename_end1.fq', output_path)
-        # Upload the fastq file we just made to a reads object in KBase
-        # upa = self.upload_reads(
-        #     name=params["output_name"], reads_path=out_path, wsname=params["workspace_name"]
-        # )
-        #logging.warning(f">>>>>>>>>>>>>>>>>>>>{os.getcwd()}")
-        #fastq_path = '/kb/module/test/filename_end1.fq'
-        #fastq_file = open(fastq_path, 'r')
-        #print(fastq_file.read())
-
+        
         return output_path
     
 
@@ -295,7 +108,6 @@ def upload_reads(self, name, reads_path, workspace_name, sequencing_tech, interl
             "interleaved": interleaved
             #"single_genome": single_genome
         }
-        # It is often useful to log parameters as they are passed.
         logging.warning(f">>>>>>>>>>>>>>>>>>>>{ur_params}")
         return self.ru.upload_reads(ur_params)
     
@@ -306,11 +118,8 @@ def __init__(self, ctx, config, clients_class=None):
         and allows you to pass in more clients as needed.
         """
         super().__init__(ctx, config, clients_class)
-        # Here we adjust the instance attributes for our convenience.
         self.report = self.clients.KBaseReport
         self.ru = self.clients.ReadsUtils
-        # self.shared_folder is defined in the Core App class.
-        # TODO Add a self.wsid = a conversion of self.wsname
     
     def intersection(self, first_file, second_file):
         file1 = first_file
diff --git a/requirements.txt b/requirements.txt
@@ -3,4 +3,3 @@ matplotlib==3.3.4
 pandas==1.1.5
 pytest==7.1.1
 pytest-cov==3.0.0
-pysam>=0.19.0
diff --git a/test/aligned.bam b/test/aligned.bam
diff --git a/test/kb_bedtools_server_test.py b/test/kb_bedtools_server_test.py
@@ -6,11 +6,9 @@
 import subprocess
 import time
 import unittest
-import unittest
 
 from configparser import ConfigParser
 from shutil import copyfile
-from unittest.mock import patch
 
 from kb_bedtools.kb_bedtoolsImpl import kb_bedtools
 from kb_bedtools.kb_bedtoolsServer import MethodContext
@@ -77,41 +75,36 @@ def tearDownClass(cls):
             print("Test workspace was deleted")
 
     def copy_bam_to_scratch(self):
-        bam_src = os.path.join(os.path.dirname(__file__), "aligned.bam")
-        bam_dst = os.path.join(self.scratch, "aligned.bam")
+        bam_src = os.path.join(os.path.dirname(__file__), "minimal.bam")
+        bam_dst = os.path.join(self.scratch, "minimal.bam")
 
         shutil.copy(bam_src, bam_dst)
         print(f"Copied BAM file to scratch: {bam_dst}")
         return bam_dst
 
-
-    # NOTE: According to Python unittest naming rules test method names should start from 'test'. # noqa
-    # @unittest.skip("Skip test for debugging")
-    # Now when run_kb_bedtools calls download_staging_file, it uses your mock
-    @patch.object(DataFileUtil, "download_staging_file", side_effect=mock_download_staging_file)
-    def test_your_method(self, mock_download):
-        # Prepare test objects in workspace if needed using
-        # self.getWsClient().save_objects({'workspace': self.getWsName(),
-        #                                  'objects': []})
-        #
-        # Run your method by
-        # ret = self.getImpl().your_method(self.getContext(), parameters...)
-        #
-        # Check returned data with
-        # self.assertEqual(ret[...], ...) or other unittest methods
+    def test_intersect(self):
+        # in the test, use print() to put things in stdout
+        first_file = 'GSE203496_xmoo1_line_pooled_assembly.gff'
+        second_file = 'GSE240325_apo_rbfox_insitu_clustered.sorted.filtered_lite.gff'
+        self.serviceImpl.run_kb_bedtools_intersect(
+            self.ctx,
+            {
+                "workspace_name": self.wsName,
+                "first_file" : first_file,
+                "second_file" : second_file,
+                "output_name": "intersectOutput",
+            })
 
         params = {
             "workspace_name": self.wsName,
             "reads_ref": "70257/2/1",
             "output_name": "ReadsOutputName",
             "interleaved": True,
-            "bam_file": "aligned.bam",
+            "bam_file": "minimal.bam",
             "fastq_path_name": os.path.join("/kb/module/work/tmp", "filename_end2.fq"),
         }
 
         ret = self.serviceImpl.run_kb_bedtools(self.ctx, params)
 
-        print("REPORT:", ret)
-
         self.assertIn("report_name", ret[0])
         self.assertIn("report_ref", ret[0])
diff --git a/test/unit_tests/test_kb_bedtools_utils.py b/test/unit_tests/test_kb_bedtools_utils.py