Skip to content

Commit 096e9a1

Browse files
authored
Make glob consistent with glob.glob (#1382)
1 parent 71262d1 commit 096e9a1

File tree

6 files changed

+173
-290
lines changed

6 files changed

+173
-290
lines changed

fsspec/asyn.py

Lines changed: 18 additions & 38 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
from .exceptions import FSTimeoutError
1616
from .implementations.local import LocalFileSystem, make_path_posix, trailing_sep
1717
from .spec import AbstractBufferedFile, AbstractFileSystem
18-
from .utils import is_exception, other_paths
18+
from .utils import glob_translate, is_exception, other_paths
1919

2020
private = re.compile("_[^_]")
2121
iothread = [None] # dedicated fsspec IO thread
@@ -745,8 +745,12 @@ async def _glob(self, path, maxdepth=None, **kwargs):
745745

746746
import re
747747

748-
ends = path.endswith("/")
748+
seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
749+
ends_with_sep = path.endswith(seps) # _strip_protocol strips trailing slash
749750
path = self._strip_protocol(path)
751+
append_slash_to_dirname = ends_with_sep or path.endswith(
752+
tuple(sep + "**" for sep in seps)
753+
)
750754
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
751755
idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
752756
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
@@ -785,46 +789,22 @@ async def _glob(self, path, maxdepth=None, **kwargs):
785789
allpaths = await self._find(
786790
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
787791
)
788-
# Escape characters special to python regex, leaving our supported
789-
# special characters in place.
790-
# See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
791-
# for shell globbing details.
792-
pattern = (
793-
"^"
794-
+ (
795-
path.replace("\\", r"\\")
796-
.replace(".", r"\.")
797-
.replace("+", r"\+")
798-
.replace("//", "/")
799-
.replace("(", r"\(")
800-
.replace(")", r"\)")
801-
.replace("|", r"\|")
802-
.replace("^", r"\^")
803-
.replace("$", r"\$")
804-
.replace("{", r"\{")
805-
.replace("}", r"\}")
806-
.rstrip("/")
807-
.replace("?", ".")
808-
)
809-
+ "$"
810-
)
811-
pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern)
812-
pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern)
813-
pattern = re.sub("[*]", "[^/]*", pattern)
814-
pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern)
815-
pattern = re.sub("=DOUBLE_STARS=", ".*", pattern)
792+
793+
pattern = glob_translate(path + ("/" if ends_with_sep else ""))
816794
pattern = re.compile(pattern)
795+
817796
out = {
818-
p: allpaths[p]
819-
for p in sorted(allpaths)
820-
if pattern.match(p.replace("//", "/").rstrip("/"))
797+
p: info
798+
for p, info in sorted(allpaths.items())
799+
if pattern.match(
800+
(
801+
p + "/"
802+
if append_slash_to_dirname and info["type"] == "directory"
803+
else p
804+
)
805+
)
821806
}
822807

823-
# Return directories only when the glob end by a slash
824-
# This is needed for posix glob compliance
825-
if ends:
826-
out = {k: v for k, v in out.items() if v["type"] == "directory"}
827-
828808
if detail:
829809
return out
830810
else:

fsspec/implementations/http.py

Lines changed: 21 additions & 37 deletions
Original file line number | Diff line number | Diff line change
@@ -14,7 +14,13 @@
1414
from fsspec.callbacks import _DEFAULT_CALLBACK
1515
from fsspec.exceptions import FSTimeoutError
1616
from fsspec.spec import AbstractBufferedFile
17-
from fsspec.utils import DEFAULT_BLOCK_SIZE, isfilelike, nullcontext, tokenize
17+
from fsspec.utils import (
18+
DEFAULT_BLOCK_SIZE,
19+
glob_translate,
20+
isfilelike,
21+
nullcontext,
22+
tokenize,
23+
)
1824

1925
from ..caching import AllBytes
2026

@@ -441,8 +447,9 @@ async def _glob(self, path, maxdepth=None, **kwargs):
441447
raise ValueError("maxdepth must be at least 1")
442448
import re
443449

444-
ends = path.endswith("/")
450+
ends_with_slash = path.endswith("/") # _strip_protocol strips trailing slash
445451
path = self._strip_protocol(path)
452+
append_slash_to_dirname = ends_with_slash or path.endswith("/**")
446453
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
447454
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
448455

@@ -480,45 +487,22 @@ async def _glob(self, path, maxdepth=None, **kwargs):
480487
allpaths = await self._find(
481488
root, maxdepth=depth, withdirs=True, detail=True, **kwargs
482489
)
483-
# Escape characters special to python regex, leaving our supported
484-
# special characters in place.
485-
# See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
486-
# for shell globbing details.
487-
pattern = (
488-
"^"
489-
+ (
490-
path.replace("\\", r"\\")
491-
.replace(".", r"\.")
492-
.replace("+", r"\+")
493-
.replace("//", "/")
494-
.replace("(", r"\(")
495-
.replace(")", r"\)")
496-
.replace("|", r"\|")
497-
.replace("^", r"\^")
498-
.replace("$", r"\$")
499-
.replace("{", r"\{")
500-
.replace("}", r"\}")
501-
.rstrip("/")
502-
)
503-
+ "$"
504-
)
505-
pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern)
506-
pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern)
507-
pattern = re.sub("[*]", "[^/]*", pattern)
508-
pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern)
509-
pattern = re.sub("=DOUBLE_STARS=", ".*", pattern)
490+
491+
pattern = glob_translate(path + ("/" if ends_with_slash else ""))
510492
pattern = re.compile(pattern)
493+
511494
out = {
512-
p: allpaths[p]
513-
for p in sorted(allpaths)
514-
if pattern.match(p.replace("//", "/").rstrip("/"))
495+
p: info
496+
for p, info in sorted(allpaths.items())
497+
if pattern.match(
498+
(
499+
p + "/"
500+
if append_slash_to_dirname and info["type"] == "directory"
501+
else p
502+
)
503+
)
515504
}
516505

517-
# Return directories only when the glob end by a slash
518-
# This is needed for posix glob compliance
519-
if ends:
520-
out = {k: v for k, v in out.items() if v["type"] == "directory"}
521-
522506
if detail:
523507
return out
524508
else:

fsspec/spec.py

Lines changed: 17 additions & 41 deletions
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,7 @@
1717
from .transaction import Transaction
1818
from .utils import (
1919
_unstrip_protocol,
20+
glob_translate,
2021
isfilelike,
2122
other_paths,
2223
read_block,
@@ -551,19 +552,19 @@ def glob(self, path, maxdepth=None, **kwargs):
551552
552553
The `maxdepth` option is applied on the first `**` found in the path.
553554
554-
Search path names that contain embedded characters special to this
555-
implementation of glob may not produce expected results;
556-
e.g., ``foo/bar/*starredfilename*``.
557-
558555
kwargs are passed to ``ls``.
559556
"""
560557
if maxdepth is not None and maxdepth < 1:
561558
raise ValueError("maxdepth must be at least 1")
562559

563560
import re
564561

565-
ends = path.endswith("/")
562+
seps = (os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)
563+
ends_with_sep = path.endswith(seps) # _strip_protocol strips trailing slash
566564
path = self._strip_protocol(path)
565+
append_slash_to_dirname = ends_with_sep or path.endswith(
566+
tuple(sep + "**" for sep in seps)
567+
)
567568
idx_star = path.find("*") if path.find("*") >= 0 else len(path)
568569
idx_qmark = path.find("?") if path.find("?") >= 0 else len(path)
569570
idx_brace = path.find("[") if path.find("[") >= 0 else len(path)
@@ -600,47 +601,22 @@ def glob(self, path, maxdepth=None, **kwargs):
600601
depth = None
601602

602603
allpaths = self.find(root, maxdepth=depth, withdirs=True, detail=True, **kwargs)
603-
# Escape characters special to python regex, leaving our supported
604-
# special characters in place.
605-
# See https://www.gnu.org/software/bash/manual/html_node/Pattern-Matching.html
606-
# for shell globbing details.
607-
pattern = (
608-
"^"
609-
+ (
610-
path.replace("\\", r"\\")
611-
.replace(".", r"\.")
612-
.replace("+", r"\+")
613-
.replace("//", "/")
614-
.replace("(", r"\(")
615-
.replace(")", r"\)")
616-
.replace("|", r"\|")
617-
.replace("^", r"\^")
618-
.replace("$", r"\$")
619-
.replace("{", r"\{")
620-
.replace("}", r"\}")
621-
.rstrip("/")
622-
.replace("?", ".")
623-
)
624-
+ "$"
625-
)
626-
pattern = re.sub("/[*]{2}", "=SLASH_DOUBLE_STARS=", pattern)
627-
pattern = re.sub("[*]{2}/?", "=DOUBLE_STARS=", pattern)
628-
pattern = re.sub("[*]", "[^/]*", pattern)
629-
pattern = re.sub("=SLASH_DOUBLE_STARS=", "(|/.*)", pattern)
630-
pattern = re.sub("=DOUBLE_STARS=", ".*", pattern)
604+
605+
pattern = glob_translate(path + ("/" if ends_with_sep else ""))
631606
pattern = re.compile(pattern)
632607

633608
out = {
634-
p: allpaths[p]
635-
for p in sorted(allpaths)
636-
if pattern.match(p.replace("//", "/").rstrip("/"))
609+
p: info
610+
for p, info in sorted(allpaths.items())
611+
if pattern.match(
612+
(
613+
p + "/"
614+
if append_slash_to_dirname and info["type"] == "directory"
615+
else p
616+
)
617+
)
637618
}
638619

639-
# Return directories only when the glob end by a slash
640-
# This is needed for posix glob compliance
641-
if ends:
642-
out = {k: v for k, v in out.items() if v["type"] == "directory"}
643-
644620
if detail:
645621
return out
646622
else:

fsspec/tests/abstract/common.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -107,9 +107,9 @@
107107
"subdir1/subfile2",
108108
],
109109
),
110-
("**1", False, None, ["file1", "subdir0/subfile1", "subdir1/subfile1"]),
110+
("**/*1", False, None, ["file1", "subdir0/subfile1", "subdir1/subfile1"]),
111111
(
112-
"**1",
112+
"**/*1",
113113
True,
114114
None,
115115
[
@@ -120,14 +120,14 @@
120120
"subdir1/nesteddir/nestedfile",
121121
],
122122
),
123-
("**1", True, 1, ["file1"]),
123+
("**/*1", True, 1, ["file1"]),
124124
(
125-
"**1",
125+
"**/*1",
126126
True,
127127
2,
128128
["file1", "subdir0/subfile1", "subdir1/subfile1", "subdir1/subfile2"],
129129
),
130-
("**1", False, 2, ["file1", "subdir0/subfile1", "subdir1/subfile1"]),
130+
("**/*1", False, 2, ["file1", "subdir0/subfile1", "subdir1/subfile1"]),
131131
("**/subdir0", False, None, []),
132132
("**/subdir0", True, None, ["subfile1", "subfile2", "nesteddir/nestedfile"]),
133133
("**/subdir0/nested*", False, 2, []),

0 commit comments

Comments
 (0)