feat(builtins/method): bytes/bytearray: join

litlighilit · litlighilit · commit fce6a89722f7 · 2025-10-15T19:33:08.000+08:00
diff --git a/Objects/byteobjects.nim b/Objects/byteobjects.nim
@@ -5,6 +5,7 @@ import ./pyobject
 from ./abstract/iter import PyObject_GetIter
 import ./[listobject, tupleobjectImpl, stringobject, exceptions, iterobject]
 import ./numobjects/intobject/[decl, ops_imp_warn]
+import ../Utils/addr0
 #XXX: Nim's string ops has bugs for NUL('\0') char, e.g. len('1\02') gives 2
 declarePyType Bytes(tpToken):
   items: seq[char]
@@ -54,15 +55,30 @@ proc contains*(s: PyByteLike, c: char): bool = c in s.items
 proc `[]`*(s: PyByteLike, i: int): char = s.items[i]
 proc getInt*(s: PyByteLike, i: int): PyIntObject = newPyInt s[i]
 
-template impl(B, InitT, newTOfCap){.dirty.} =
+when defined(js):
+  type CharsView* = seq[char]
+else:
+  type CharsView* = cstring  ## Unstable impl detail. Writing through the CharsView of a PyBytes is UB.
+  ## On the JS backend it is currently just a copy, not a real view.
+  proc getCharPtr*(s: PyByteLike; i: int): ptr char = addr s.items[i]  ## Unstable: raw pointer to byte `i`.
+  ##  Not available on the JS backend.
 
+template impl(B, InitT, newTOfCap){.dirty.} =
   proc asString*(s: `Py B Object`): string = $s.items
+  proc charsView*(s: `Py B Object`): CharsView =
+    when defined(js): s.items
+    else:
+      return cast[cstring](s.items.addr0)
   method `$`*(s: `Py B Object`): string = s.asString
-  proc `newPy B`*(s: InitT = default InitT): `Py B Object` =
+  proc `newPy B`*(s: InitT): `Py B Object` =
     result = `newPy B Simple`()
     result.items = s
   proc `newPy B`*(size: int): `Py B Object` =
     `newPy B` newTOfCap size
+
+  let `empty B` = `newPy B` @[]
+  proc `newPy B`*(): `Py B Object` = `empty B`
+
   proc `&`*(s1, s2: `Py B Object`): `Py B Object` =
     `newPy B`(s1.items & s2.items)
 
diff --git a/Objects/byteobjectsImpl.nim b/Objects/byteobjectsImpl.nim
@@ -1,12 +1,13 @@
 
 import std/strformat
-import ../Utils/[sequtils, addr0]
+import ../Utils/[sequtils, destroyPatch, addr0]
 import ./byteobjects
 import ./pyobject
 import ./[boolobject, numobjects, stringobjectImpl, exceptions, noneobject,
   iterobject, hash, abstract,
 ]
 import ./tupleobjectImpl
+import ./stringlib/join
 from ./listobject import genMutableSequenceMethods
 
 export byteobjects
@@ -114,6 +115,27 @@ genMutableSequenceMethods PyNumber_AsCharOrRet, newPyInt, ByteArray, char:
     if self.len == high int:
       return newOverflowError newPyAscii"cannot add more objects to bytearray"
 
+#TODO:buffer
+# Workaround until the buffer protocol is implemented: a minimal stand-in for `Py_buffer`.
+type Py_buffer = object
+  buf: CharsView  ## view of the exporting object's bytes
+  len: int  ## byte count recorded alongside `buf`
+  obj: PyObject  ## object the view was taken from
+defdestroy Py_buffer: discard
+#proc PyBuffer_Release(b: Py_buffer) = discard
+
+proc init_Py_buffer(buf: CharsView, len: int, obj: PyObject, ): Py_buffer = Py_buffer(buf: buf, len: len, obj: obj)  ## Field-wise constructor.
+
+proc to_py_buffer(b: PyBytesObject|PyByteArrayObject): CharsView = b.charsView  ## Shorthand for `b.charsView`; used by the `bytes_join` pre-pass.
+
+template genJoin(B; mut: bool){.dirty.} =  # defines `join` for `Py B Object`; `mut` is forwarded as `mutable`
+  proc join*(b: `Py B Object`, iterable: PyObject): PyObject{.pyCFuncPragma.} =
+    bytes_join B, b, iterable, mutable=mut  # `b` acts as the separator
+  `impl B Method` join(iterable): self.join iterable  # presumably registers the Python-level method; confirm against `impl B Method`
+
+genJoin bytes, false
+genJoin bytearray, true
+
 template impl(x, fromSize, fromObject) =
   if x.ofPyStrObject:
     return newTypeError newPyAscii"string argument without an encoding"
diff --git a/Objects/stringlib/join.nim b/Objects/stringlib/join.nim
@@ -0,0 +1,137 @@
+
+import std/strformat
+import ../[
+  pyobjectBase,
+  stringobject,
+  exceptions,
+]
+import ../abstract/sequence/list
+import ../../Utils/rtarrays
+
+template bytes_join*(S; sep; iterable: PyObject; mutable: bool)#[: PyObject]#{.dirty.} =
+  ## Body of `sep.join(iterable)` for bytes-like type `S`; expand it inside a proc
+  ## returning `PyObject`. Yields a new `S` holding the items' bytes with `sep`
+  ## between adjacent items, or an exception object (TypeError for an item that
+  ## is not bytes-like, OverflowError, RuntimeError) when the join fails.
+  bind RtArray, initRtArray
+  bind PySequence_Fast, PySequence_Fast_GET_SIZE, PySequence_Fast_GET_ITEM
+  bind newPyStr, newPyAscii, newRuntimeError, newTypeError, newOverflowError
+  bind formatValue, fmt
+  let
+    sepstr = sep.charsView
+    seplen = len(sep)
+
+  let sequ = PySequence_Fast(iterable, "can only join an iterable")
+  retIfExc sequ
+
+  let seqlen = PySequence_Fast_GET_SIZE(sequ)
+  if seqlen == 0:
+    return `newPy S`()
+
+  var item: PyObject
+  when not mutable:
+    if seqlen == 1:
+      item = PySequence_Fast_GET_ITEM(sequ, 0)
+      if item.`ofExactPy S Object`:
+        return item  # immutable result: the sole exact-`S` item is reused as-is
+
+  const GIL_THRESHOLD = 1048576
+
+  #XXX: NIM-BUG: when JS using RtArray: `Error: internal error: ("genAddr: 2", skTemp)`
+  # due to `[]=` or `[]` to RtArray
+  var buffers = (when defined(js): newSeq else: initRTArray)[Py_buffer](seqlen)
+
+  #[ General case: a pre-pass computes the total amount of space needed (sz)
+    and checks that every item is bytes-like.
+    ]#
+  var sz = 0
+  var nbufs = 0
+  var drop_gil = true
+  for i in 0 ..< seqlen:
+    item = PySequence_Fast_GET_ITEM(sequ, i)
+    proc asgn(b: auto) =
+      buffers[i] = init_Py_buffer(to_py_buffer(b), b.len, item)
+    if item.ofExactPyBytesObject:
+      # Fast path.
+      let b = PyBytesObject(item)
+      asgn b
+    elif item.ofExactPyByteArrayObject:
+      let b = PyByteArrayObject(item)
+      asgn b
+    else:
+      template byteslikeExpect =
+        return newTypeError newPyStr(
+          fmt"sequence item {i}: expected a bytes-like object, {item.typeName:.80s} found"
+        )
+      when defined(npython_buffer):
+        #TODO:buffer
+        let exc: PyBaseErrorObject = PyObject_GetBuffer(item, buffers[i], PyBUF.SIMPLE)
+        if not exc.isNil:
+          byteslikeExpect
+        #[ If the backing objects are mutable, then dropping the GIL
+          opens up race conditions where another thread tries to modify
+          the object which we hold a buffer on it. Such code has data
+          races anyway, but this is a conservative approach that avoids
+          changing the behaviour of that data race.
+          ]#
+        drop_gil = false
+      else:
+        byteslikeExpect
+
+    nbufs = i + 1  # for error cleanup
+    let itemlen = buffers[i].len
+    template resTooLong =
+      return newOverflowError newPyAscii"join() result is too long"
+    template `+?=`(s: var int; i: int) =
+      if i > int.high - s: resTooLong
+      s += i
+    sz +?= itemlen
+    if i != 0:
+      sz +?= seplen
+    if seqlen != PySequence_Fast_GET_SIZE(sequ):
+      return newRuntimeError newPyAscii"sequence changed size during iteration"
+
+  # Allocate result space.
+  var res = `newPy S`(sz)
+
+  # Catenate everything.
+  var p = 0
+  when declared(copyMem):
+    template memcpy(_, b; n: int) =  # no-op for n == 0: `p` may already equal the result length
+      if n > 0: copyMem(res.getCharPtr p, b[0].addr, n)
+  else:
+    template memcpy(_, b; n: int) =
+      for i in 0..<n:#p ..< p+n:
+        res.items[p+i] = b[i]
+  template addbn(b; n: int) =
+    memcpy(res[p], b, n)
+    p += n
+  template addb(bExpr: Py_buffer) =
+    let b = bExpr
+    addbn(b.buf, b.len)
+  if sz < GIL_THRESHOLD:
+    drop_gil = false   # Benefits are likely outweighed by the overheads
+  #TODO:threads
+  const hasPyThrd = defined(npython_threads)
+  when hasPyThrd:
+    var save: PyThreadState
+    if drop_gil: save = PyEval_SaveThread()
+
+  if seplen == 0:
+    # fast path
+    for i in 0..<nbufs:
+      addb buffers[i]
+  else:
+    for i in 0 ..< nbufs:
+      # `sep` precedes every item except the first, matching the pre-pass sizing
+      if i != 0:
+        addbn(sepstr, seplen)
+      addb(buffers[i])
+
+  when hasPyThrd:
+    if drop_gil: PyEval_RestoreThread(save)
+
+  # RtArray's `=destroy` will call buffer's destroy: no PyBuffer_Release/PyMem_Free needed
+  return res
+
+