Skip to content
Merged
3 changes: 1 addition & 2 deletions Lib/pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,8 +330,7 @@ def _parse_path(cls, path):
elif len(drv_parts) == 6:
# e.g. //?/unc/server/share
root = sep
unfiltered_parsed = [drv + root] + rel.split(sep)
parsed = [sys.intern(x) for x in unfiltered_parsed if x and x != '.']
parsed = [sys.intern(str(x)) for x in rel.split(sep) if x and x != '.']
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoa, we sys.intern everything? That seems like a painful memory leak. Would we be better off with a class level lru_cache'd function1?

Don't we already know that rel will be str here? If it's bytes, str(x) is the wrong conversion anyway. Maybe the intern function should be cls._flavour.str_and_intern?

Footnotes

  1. @lru_cache(max_size=REASONABLE_VALUE) def intern(x): return x

Copy link
Contributor Author

@barneygale barneygale Apr 11, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The interning of the path parts is longstanding pathlib behaviour. Honestly I don't know enough about the benefits/drawbacks of interning to say whether it's reasonable.

Don't we already know that rel will be str here?

We know that it's an instance of str (and not bytes), but we don't know whether it's a true str object (required by sys.intern()).

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Whoa, we sys.intern everything? That seems like a painful memory leak.

Is it really all that bad? sys.intern() calls PyUnicode_InternInPlace(), not PyUnicode_InternImmortal(). An interned string gets deallocated based on its reference count, like any other object. For example:

>>> s = sys.intern('spam and eggs')
>>> t = sys.intern('spam and eggs')
>>> s is t
True
>>> id(s)
139835455199152
>>> del s, t
>>> s = sys.intern('spam and eggs')
>>> id(s)
139835455198912

return drv, root, parsed

def _load_parts(self):
Expand Down
144 changes: 17 additions & 127 deletions Lib/test/test_pathlib.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,133 +24,6 @@
grp = pwd = None


class _BaseFlavourTest(object):

def _check_parse_parts(self, arg, expected):
def f(parts):
path = self.cls(*parts)._raw_path
return self.cls._parse_path(path)
sep = self.flavour.sep
altsep = self.flavour.altsep
actual = f([x.replace('/', sep) for x in arg])
self.assertEqual(actual, expected)
if altsep:
actual = f([x.replace('/', altsep) for x in arg])
self.assertEqual(actual, expected)

def test_parse_parts_common(self):
check = self._check_parse_parts
sep = self.flavour.sep
# Unanchored parts.
check([], ('', '', []))
check(['a'], ('', '', ['a']))
check(['a/'], ('', '', ['a']))
check(['a', 'b'], ('', '', ['a', 'b']))
# Expansion.
check(['a/b'], ('', '', ['a', 'b']))
check(['a/b/'], ('', '', ['a', 'b']))
check(['a', 'b/c', 'd'], ('', '', ['a', 'b', 'c', 'd']))
# Collapsing and stripping excess slashes.
check(['a', 'b//c', 'd'], ('', '', ['a', 'b', 'c', 'd']))
check(['a', 'b/c/', 'd'], ('', '', ['a', 'b', 'c', 'd']))
# Eliminating standalone dots.
check(['.'], ('', '', []))
check(['.', '.', 'b'], ('', '', ['b']))
check(['a', '.', 'b'], ('', '', ['a', 'b']))
check(['a', '.', '.'], ('', '', ['a']))
# The first part is anchored.
check(['/a/b'], ('', sep, [sep, 'a', 'b']))
check(['/a', 'b'], ('', sep, [sep, 'a', 'b']))
check(['/a/', 'b'], ('', sep, [sep, 'a', 'b']))
# Ignoring parts before an anchored part.
check(['a', '/b', 'c'], ('', sep, [sep, 'b', 'c']))
check(['a', '/b', '/c'], ('', sep, [sep, 'c']))


class PosixFlavourTest(_BaseFlavourTest, unittest.TestCase):
cls = pathlib.PurePosixPath
flavour = pathlib.PurePosixPath._flavour

def test_parse_parts(self):
check = self._check_parse_parts
# Collapsing of excess leading slashes, except for the double-slash
# special case.
check(['//a', 'b'], ('', '//', ['//', 'a', 'b']))
check(['///a', 'b'], ('', '/', ['/', 'a', 'b']))
check(['////a', 'b'], ('', '/', ['/', 'a', 'b']))
# Paths which look like NT paths aren't treated specially.
check(['c:a'], ('', '', ['c:a']))
check(['c:\\a'], ('', '', ['c:\\a']))
check(['\\a'], ('', '', ['\\a']))


class NTFlavourTest(_BaseFlavourTest, unittest.TestCase):
cls = pathlib.PureWindowsPath
flavour = pathlib.PureWindowsPath._flavour

def test_parse_parts(self):
check = self._check_parse_parts
# First part is anchored.
check(['c:'], ('c:', '', ['c:']))
check(['c:/'], ('c:', '\\', ['c:\\']))
check(['/'], ('', '\\', ['\\']))
check(['c:a'], ('c:', '', ['c:', 'a']))
check(['c:/a'], ('c:', '\\', ['c:\\', 'a']))
check(['/a'], ('', '\\', ['\\', 'a']))
# UNC paths.
check(['//'], ('\\\\', '', ['\\\\']))
check(['//a'], ('\\\\a', '', ['\\\\a']))
check(['//a/'], ('\\\\a\\', '', ['\\\\a\\']))
check(['//a/b'], ('\\\\a\\b', '\\', ['\\\\a\\b\\']))
check(['//a/b/'], ('\\\\a\\b', '\\', ['\\\\a\\b\\']))
check(['//a/b/c'], ('\\\\a\\b', '\\', ['\\\\a\\b\\', 'c']))
# Second part is anchored, so that the first part is ignored.
check(['a', 'Z:b', 'c'], ('Z:', '', ['Z:', 'b', 'c']))
check(['a', 'Z:/b', 'c'], ('Z:', '\\', ['Z:\\', 'b', 'c']))
# UNC paths.
check(['a', '//b/c', 'd'], ('\\\\b\\c', '\\', ['\\\\b\\c\\', 'd']))
# Collapsing and stripping excess slashes.
check(['a', 'Z://b//c/', 'd/'], ('Z:', '\\', ['Z:\\', 'b', 'c', 'd']))
# UNC paths.
check(['a', '//b/c//', 'd'], ('\\\\b\\c', '\\', ['\\\\b\\c\\', 'd']))
# Extended paths.
check(['//./c:'], ('\\\\.\\c:', '', ['\\\\.\\c:']))
check(['//?/c:/'], ('\\\\?\\c:', '\\', ['\\\\?\\c:\\']))
check(['//?/c:/a'], ('\\\\?\\c:', '\\', ['\\\\?\\c:\\', 'a']))
check(['//?/c:/a', '/b'], ('\\\\?\\c:', '\\', ['\\\\?\\c:\\', 'b']))
# Extended UNC paths (format is "\\?\UNC\server\share").
check(['//?'], ('\\\\?', '', ['\\\\?']))
check(['//?/'], ('\\\\?\\', '', ['\\\\?\\']))
check(['//?/UNC'], ('\\\\?\\UNC', '', ['\\\\?\\UNC']))
check(['//?/UNC/'], ('\\\\?\\UNC\\', '', ['\\\\?\\UNC\\']))
check(['//?/UNC/b'], ('\\\\?\\UNC\\b', '', ['\\\\?\\UNC\\b']))
check(['//?/UNC/b/'], ('\\\\?\\UNC\\b\\', '', ['\\\\?\\UNC\\b\\']))
check(['//?/UNC/b/c'], ('\\\\?\\UNC\\b\\c', '\\', ['\\\\?\\UNC\\b\\c\\']))
check(['//?/UNC/b/c/'], ('\\\\?\\UNC\\b\\c', '\\', ['\\\\?\\UNC\\b\\c\\']))
check(['//?/UNC/b/c/d'], ('\\\\?\\UNC\\b\\c', '\\', ['\\\\?\\UNC\\b\\c\\', 'd']))
# UNC device paths
check(['//./BootPartition/'], ('\\\\.\\BootPartition', '\\', ['\\\\.\\BootPartition\\']))
check(['//?/BootPartition/'], ('\\\\?\\BootPartition', '\\', ['\\\\?\\BootPartition\\']))
check(['//./PhysicalDrive0'], ('\\\\.\\PhysicalDrive0', '', ['\\\\.\\PhysicalDrive0']))
check(['//?/Volume{}/'], ('\\\\?\\Volume{}', '\\', ['\\\\?\\Volume{}\\']))
check(['//./nul'], ('\\\\.\\nul', '', ['\\\\.\\nul']))
# Second part has a root but not drive.
check(['a', '/b', 'c'], ('', '\\', ['\\', 'b', 'c']))
check(['Z:/a', '/b', 'c'], ('Z:', '\\', ['Z:\\', 'b', 'c']))
check(['//?/Z:/a', '/b', 'c'], ('\\\\?\\Z:', '\\', ['\\\\?\\Z:\\', 'b', 'c']))
# Joining with the same drive => the first path is appended to if
# the second path is relative.
check(['c:/a/b', 'c:x/y'], ('c:', '\\', ['c:\\', 'a', 'b', 'x', 'y']))
check(['c:/a/b', 'c:/x/y'], ('c:', '\\', ['c:\\', 'x', 'y']))
# Paths to files with NTFS alternate data streams
check(['./c:s'], ('', '', ['c:s']))
check(['cc:s'], ('', '', ['cc:s']))
check(['C:c:s'], ('C:', '', ['C:', 'c:s']))
check(['C:/c:s'], ('C:', '\\', ['C:\\', 'c:s']))
check(['D:a', './c:b'], ('D:', '', ['D:', 'a', 'c:b']))
check(['D:/a', './c:b'], ('D:', '\\', ['D:\\', 'a', 'c:b']))


#
# Tests for the pure classes.
#
Expand Down Expand Up @@ -937,6 +810,9 @@ def test_drive_root_parts(self):
check(('c:/a',), 'c:', '\\', ('c:\\', 'a'))
check(('/a',), '', '\\', ('\\', 'a'))
# UNC paths.
check(('//',), '\\\\', '', ('\\\\',))
check(('//a',), '\\\\a', '', ('\\\\a',))
check(('//a/',), '\\\\a\\', '', ('\\\\a\\',))
check(('//a/b',), '\\\\a\\b', '\\', ('\\\\a\\b\\',))
check(('//a/b/',), '\\\\a\\b', '\\', ('\\\\a\\b\\',))
check(('//a/b/c',), '\\\\a\\b', '\\', ('\\\\a\\b\\', 'c'))
Expand All @@ -950,12 +826,26 @@ def test_drive_root_parts(self):
# UNC paths.
check(('a', '//b/c//', 'd'), '\\\\b\\c', '\\', ('\\\\b\\c\\', 'd'))
# Extended paths.
check(('//./c:',), '\\\\.\\c:', '', ('\\\\.\\c:',))
check(('//?/c:/',), '\\\\?\\c:', '\\', ('\\\\?\\c:\\',))
check(('//?/c:/a',), '\\\\?\\c:', '\\', ('\\\\?\\c:\\', 'a'))
check(('//?/c:/a', '/b'), '\\\\?\\c:', '\\', ('\\\\?\\c:\\', 'b'))
# Extended UNC paths (format is "\\?\UNC\server\share").
check(('//?',), '\\\\?', '', ('\\\\?',))
check(('//?/',), '\\\\?\\', '', ('\\\\?\\',))
check(('//?/UNC',), '\\\\?\\UNC', '', ('\\\\?\\UNC',))
check(('//?/UNC/',), '\\\\?\\UNC\\', '', ('\\\\?\\UNC\\',))
check(('//?/UNC/b',), '\\\\?\\UNC\\b', '', ('\\\\?\\UNC\\b',))
check(('//?/UNC/b/',), '\\\\?\\UNC\\b\\', '', ('\\\\?\\UNC\\b\\',))
check(('//?/UNC/b/c',), '\\\\?\\UNC\\b\\c', '\\', ('\\\\?\\UNC\\b\\c\\',))
check(('//?/UNC/b/c/',), '\\\\?\\UNC\\b\\c', '\\', ('\\\\?\\UNC\\b\\c\\',))
check(('//?/UNC/b/c/d',), '\\\\?\\UNC\\b\\c', '\\', ('\\\\?\\UNC\\b\\c\\', 'd'))
# UNC device paths
check(('//./BootPartition/',), '\\\\.\\BootPartition', '\\', ('\\\\.\\BootPartition\\',))
check(('//?/BootPartition/',), '\\\\?\\BootPartition', '\\', ('\\\\?\\BootPartition\\',))
check(('//./PhysicalDrive0',), '\\\\.\\PhysicalDrive0', '', ('\\\\.\\PhysicalDrive0',))
check(('//?/Volume{}/',), '\\\\?\\Volume{}', '\\', ('\\\\?\\Volume{}\\',))
check(('//./nul',), '\\\\.\\nul', '', ('\\\\.\\nul',))
# Second part has a root but not drive.
check(('a', '/b', 'c'), '', '\\', ('\\', 'b', 'c'))
check(('Z:/a', '/b', 'c'), 'Z:', '\\', ('Z:\\', 'b', 'c'))
Expand Down
You are viewing a condensed version of this merge commit. You can view the full changes here.