Skip to content

Commit c014a0f

Browse files
Merge pull request #187 from obsidianforensics/up-verisons-and-clean-hostnames
Up the max Chrome version to 129. Add `get_clean_hostnames` function …
2 parents 805c0a9 + 139c88c commit c014a0f

File tree

2 files changed

+44
-40
lines changed

pyhindsight/browsers/chrome.py

Lines changed: 7 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,7 @@ def determine_version(self):
116116
Based on research I did to create "Chrome Evolution" tool - dfir.blog/chrome-evolution
117117
"""
118118

119-
possible_versions = list(range(1, 125))
119+
possible_versions = list(range(1, 130))
120120
# TODO: remove 82?
121121
previous_possible_versions = possible_versions[:]
122122

@@ -207,6 +207,7 @@ def trim_lesser_versions(version):
207207
trim_lesser_versions_if('is_persistent', self.structure['Cookies']['cookies'], 66)
208208
trim_lesser_versions_if('encrypted_value', self.structure['Cookies']['cookies'], 33)
209209
trim_lesser_versions_if('priority', self.structure['Cookies']['cookies'], 28)
210+
trim_lesser_versions_if('source_type', self.structure['Cookies']['cookies'], 125)
210211
log.debug(f' - Finishing possible versions: {possible_versions}')
211212

212213
possible_versions, previous_possible_versions = \
@@ -273,6 +274,10 @@ def trim_lesser_versions(version):
273274
'key', self.structure['Network Action Predictor']['resource_prefetch_predictor_url'], 55)
274275
trim_lesser_versions_if(
275276
'proto', self.structure['Network Action Predictor']['resource_prefetch_predictor_url'], 54)
277+
if 'lcp_critical_path_predictor' in list(self.structure['Network Action Predictor'].keys()):
278+
trim_lesser_versions(117)
279+
if 'lcp_critical_path_predictor_initiator_origin' in list(self.structure['Network Action Predictor'].keys()):
280+
trim_lesser_versions(129)
276281
log.debug(f' - Finishing possible versions: {possible_versions}')
277282

278283
possible_versions, previous_possible_versions = \
@@ -2223,39 +2228,7 @@ def get_site_characteristics(self, path, dir_name):
22232228
self.parsed_artifacts.extend(result_list)
22242229

22252230
def build_hsts_domain_hashes(self):
2226-
domains = set()
2227-
for artifact in self.parsed_artifacts:
2228-
if not isinstance(artifact, self.HistoryItem):
2229-
continue
2230-
2231-
if not artifact.url:
2232-
continue
2233-
2234-
artifact_url = artifact.url
2235-
2236-
# Some artifact "URLs" will be in invalid forms, which urllib (rightly)
2237-
# won't parse. Modify these URLs so they will parse properly.
2238-
# Examples:
2239-
# Cookie: ".example.com",
2240-
# Preferences (cookie_controls_metadata): "https://[*.]example.com"
2241-
prefixes = ('.', 'https://[*.]', 'http://[*.]')
2242-
2243-
for prefix in prefixes:
2244-
if artifact_url.startswith(prefix):
2245-
artifact_url = 'http://' + artifact_url[len(prefix):]
2246-
2247-
if artifact_url.endswith(',*'):
2248-
artifact_url = artifact_url[:-2]
2249-
2250-
try:
2251-
domain = urllib.parse.urlparse(artifact_url).hostname
2252-
except ValueError as e:
2253-
log.warning(f'Error when parsing domain from {artifact_url}; {e}')
2254-
continue
2255-
2256-
# Some URLs don't have a domain, like local PDF files
2257-
if domain:
2258-
domains.add(domain)
2231+
domains = self.get_clean_hostnames()
22592232

22602233
for domain in domains:
22612234

pyhindsight/browsers/webbrowser.py

Lines changed: 37 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -111,17 +111,48 @@ def dict_factory(cursor, row):
111111
d[col[0]] = row[idx]
112112
return d
113113

114-
def build_md5_hash_list_of_origins(self):
114+
def get_clean_hostnames(self):
115+
hostnames = set()
115116
for artifact in self.parsed_artifacts:
116-
if isinstance(artifact, self.HistoryItem):
117+
if not isinstance(artifact, self.HistoryItem) or not artifact.url:
118+
continue
119+
120+
# Some artifact "URLs", often parsed from Preferences, are two
121+
# origins combined, so split them into two.
122+
# Example from Preferences (3pcd_heuristics_grants):
123+
# "https://[*.]lnkd.in,https://[*.]linkedin.com"
124+
artifact_urls = artifact.url.split(',')
125+
126+
for artifact_url in artifact_urls:
127+
# Some artifact "URLs" will be in invalid forms, which urllib (rightly)
128+
# won't parse. Modify these URLs so they will parse properly.
129+
# Examples:
130+
# Cookie: ".example.com",
131+
# Preferences (cookie_controls_metadata): "https://[*.]example.com"
132+
prefixes = ('.', 'https://[*.]', 'http://[*.]')
133+
134+
for prefix in prefixes:
135+
if artifact_url.startswith(prefix):
136+
artifact_url = 'https://' + artifact_url[len(prefix):]
137+
138+
if artifact_url.endswith(',*'):
139+
artifact_url = artifact_url[:-2]
140+
117141
try:
118-
domain = urllib.parse.urlparse(artifact.url).hostname
142+
hostname = urllib.parse.urlparse(artifact_url).hostname
119143
except ValueError as e:
120-
log.warning(f'Error when parsing domain from {artifact.url}; {e}')
144+
log.warning(f'Error when parsing domain from {artifact_url}; {e}')
121145
continue
146+
122147
# Some URLs don't have a domain, like local PDF files
123-
if domain:
124-
self.origin_hashes[hashlib.md5(domain.encode()).hexdigest()] = domain
148+
if hostname:
149+
hostnames.add(hostname)
150+
return hostnames
151+
152+
def build_md5_hash_list_of_origins(self):
153+
domains = self.get_clean_hostnames()
154+
for domain in domains:
155+
self.origin_hashes[hashlib.md5(domain.encode()).hexdigest()] = domain
125156

126157
class HistoryItem(object):
127158
def __init__(self, item_type, timestamp, profile, url=None, name=None, value=None, interpretation=None):

0 commit comments

Comments
 (0)