Skip to content

Commit

Permalink
stream test
Browse files Browse the repository at this point in the history
  • Loading branch information
synodriver committed Apr 6, 2024
1 parent 7e40642 commit 998072b
Show file tree
Hide file tree
Showing 6 changed files with 2,989 additions and 1,452 deletions.
2 changes: 1 addition & 1 deletion base16384
4,213 changes: 2,775 additions & 1,438 deletions pybase16384/backends/cython/_core.c

Large diffs are not rendered by default.

99 changes: 87 additions & 12 deletions pybase16384/backends/cython/_core.pyx
Original file line number Diff line number Diff line change
@@ -1,24 +1,28 @@
# cython: language_level=3
# cython: cdivision=True
from cpython.bytes cimport PyBytes_AS_STRING, PyBytes_Check, PyBytes_Size
from cpython.bytes cimport (PyBytes_AS_STRING, PyBytes_Check,
PyBytes_FromStringAndSize, PyBytes_GET_SIZE)
from cpython.mem cimport PyMem_Free, PyMem_Malloc
from cpython.object cimport PyObject_HasAttrString
from cpython.object cimport PyObject, PyObject_HasAttrString
from libc.stdint cimport int32_t, uint8_t
from libc.string cimport memcpy

from pybase16384.backends.cython.base16384 cimport (
BASE16384_DECBUFSZ, BASE16384_ENCBUFSZ,
BASE16384_FLAG_DO_SUM_CHECK_FORCELY, BASE16384_FLAG_NOHEADER,
BASE16384_FLAG_SUM_CHECK_ON_REMAIN, b14_decode, b14_decode_fd,
b14_decode_fd_detailed, b14_decode_file, b14_decode_file_detailed,
b14_decode_len, b14_decode_safe, b14_encode, b14_encode_fd,
b14_decode_len, b14_decode_safe, b14_decode_stream,
b14_decode_stream_detailed, b14_encode, b14_encode_fd,
b14_encode_fd_detailed, b14_encode_file, b14_encode_file_detailed,
b14_encode_len, b14_encode_safe, base16384_err_fopen_input_file,
b14_encode_len, b14_encode_safe, b14_encode_stream,
b14_encode_stream_detailed, base16384_err_fopen_input_file,
base16384_err_fopen_output_file, base16384_err_get_file_size,
base16384_err_invalid_commandline_parameter,
base16384_err_invalid_decoding_checksum, base16384_err_invalid_file_name,
base16384_err_invalid_decoding_checksum, base16384_err_invalid_file_name,base16384_io_function_t,
base16384_err_map_input_file, base16384_err_ok,
base16384_err_open_input_file, base16384_err_read_file, base16384_err_t,
base16384_err_write_file, pybase16384_64bits)
base16384_err_write_file, base16384_stream_t, pybase16384_64bits)

from pathlib import Path

Expand Down Expand Up @@ -189,7 +193,7 @@ def encode_file(object input,
first_check = 0
if not PyBytes_Check(chunk):
raise TypeError(f"input must be a file-like rb object, got {type(input).__name__}")
size = PyBytes_Size(chunk)
size = PyBytes_GET_SIZE(chunk)
if <int32_t> size < current_buf_len: # 数据不够了 要减小一次读取的量
if buf_rate > 1: # 重新设置一次读取的大小 重新设置流的位置 当然要是已经是一次读取7字节了 那就不能再变小了 直接encode吧
buf_rate = buf_rate / 2
Expand Down Expand Up @@ -236,7 +240,7 @@ def encode_file_safe(object input,
first_check = 0
if not PyBytes_Check(chunk):
raise TypeError(f"input must be a file-like rb object, got {type(input).__name__}")
size = PyBytes_Size(chunk)
size = PyBytes_GET_SIZE(chunk)
if <int32_t> size < current_buf_len: # 数据不够了 要减小一次读取的量
if buf_rate > 1: # 重新设置一次读取的大小 重新设置流的位置 当然要是已经是一次读取7字节了 那就不能再变小了 直接encode吧
buf_rate = buf_rate / 2
Expand Down Expand Up @@ -281,7 +285,7 @@ def decode_file(object input,
try:
while True:
chunk = input.read(current_buf_len) # 8的倍数
size = PyBytes_Size(chunk)
size = PyBytes_GET_SIZE(chunk)
if size == 0:
break
if <int32_t> size < current_buf_len: # 长度不够了
Expand All @@ -291,7 +295,7 @@ def decode_file(object input,
input.seek(-size, 1)
continue
tmp = input.read(2) # type: bytes
if PyBytes_Size(tmp) == 2:
if PyBytes_GET_SIZE(tmp) == 2:
if tmp[0] == 61: # = stream完了 一次解码8n+2个字节
chunk += tmp
size += 2
Expand Down Expand Up @@ -333,7 +337,7 @@ def decode_file_safe(object input,
try:
while True:
chunk = input.read(current_buf_len) # 8的倍数
size = PyBytes_Size(chunk)
size = PyBytes_GET_SIZE(chunk)
if size == 0:
break
if <int32_t> size < current_buf_len: # 长度不够了
Expand All @@ -343,7 +347,7 @@ def decode_file_safe(object input,
input.seek(-size, 1)
continue
tmp = input.read(2) # type: bytes
if PyBytes_Size(tmp) == 2:
if PyBytes_GET_SIZE(tmp) == 2:
if tmp[0] == 61: # = stream完了 一次解码8n+2个字节
chunk += tmp
size += 2
Expand Down Expand Up @@ -541,3 +545,74 @@ cpdef inline decode_fd_detailed(int inp, int out, int flag):
finally:
PyMem_Free(encbuf)
PyMem_Free(decbuf)

# stream
# Reader callback for the base16384 stream API.
#
# client_data is a borrowed pointer to a Python object exposing ``read(n)``.
# Up to ``count`` bytes are requested from it and copied into ``buffer``;
# the number of bytes actually copied is returned (0 when read() returns b"").
#
# Declared ``with gil`` because the C library invokes it while the GIL is
# released; ``except -100`` makes -100 the "Python exception raised" sentinel.
cdef ssize_t b14_readcallback(const void *client_data, void *buffer, size_t count) except -100 with gil:
    cdef object file = <object><PyObject*>client_data
    cdef bytes data = file.read(count)
    cdef ssize_t data_size
    if data is None:
        # A ``bytes``-typed variable also accepts None (e.g. a non-blocking raw
        # stream with no data available); PyBytes_AS_STRING(None) would read
        # through an invalid pointer, so reject it explicitly.
        raise TypeError("input must be a file-like rb object, read() returned None")
    data_size = <ssize_t>PyBytes_GET_SIZE(data)
    if <size_t>data_size > count:
        # ``buffer`` only has room for ``count`` bytes; a misbehaving read()
        # that returns more would make memcpy write past its end.
        raise ValueError(f"read({count}) returned {data_size} bytes, more than requested")
    memcpy(buffer, PyBytes_AS_STRING(data), <size_t>data_size)
    return data_size

# Writer callback for the base16384 stream API.
#
# client_data is a borrowed pointer to a Python object exposing ``write(b)``.
# The ``count`` bytes at ``buffer`` are copied into a new bytes object, handed
# to ``write`` and whatever ``write`` returns is converted to ssize_t and
# returned to the C caller.
#
# ``with gil``: re-acquires the GIL because the C library calls this from a
# nogil section. ``except -100``: -100 signals a raised Python exception.
cdef ssize_t b14_writecallback(const void *client_data, const void *buffer, size_t count) except -100 with gil:
    cdef object sink = <object> <PyObject *> client_data
    cdef bytes payload = PyBytes_FromStringAndSize(<char*>buffer, <Py_ssize_t>count)
    return <ssize_t>sink.write(payload)

cpdef inline encode_stream_detailed(object inp, object out, int flag):
    """Encode everything readable from ``inp`` and write the result to ``out``.

    ``inp`` is read through ``b14_readcallback`` (so it needs ``read(n)``
    returning bytes) and ``out`` is written through ``b14_writecallback``
    (so it needs ``write(b)``). ``flag`` is forwarded unchanged to
    ``base16384_encode_stream_detailed``.

    Raises:
        MemoryError: if either scratch buffer cannot be allocated.
        ValueError: if the C call returns anything other than
            ``base16384_err_ok``; the message comes from ``err_to_str``.
    """
    cdef base16384_err_t status

    # Scratch buffers required by the C API; both are released in ``finally``.
    cdef char * enc_scratch = <char *> PyMem_Malloc(<size_t> BASE16384_ENCBUFSZ)
    if enc_scratch is NULL:
        raise MemoryError
    cdef char * dec_scratch = <char *> PyMem_Malloc(<size_t> BASE16384_DECBUFSZ)
    if dec_scratch is NULL:
        PyMem_Free(enc_scratch)
        raise MemoryError

    # The struct members are const, so both streams are built in one
    # expression. client_data holds borrowed pointers to ``inp`` / ``out``,
    # which stay alive for the duration of this call as its arguments.
    cdef base16384_stream_t source = base16384_stream_t(
        f=base16384_io_function_t(reader=b14_readcallback),
        client_data=<const void *> inp,
    )
    cdef base16384_stream_t sink = base16384_stream_t(
        f=base16384_io_function_t(writer=b14_writecallback),
        client_data=<const void *> out,
    )

    try:
        # The callbacks re-acquire the GIL themselves (``with gil``).
        with nogil:
            status = b14_encode_stream_detailed(&source, &sink, enc_scratch, dec_scratch, flag)
        if status != base16384_err_ok:
            raise ValueError(err_to_str(status))
    finally:
        PyMem_Free(enc_scratch)
        PyMem_Free(dec_scratch)

cpdef inline decode_stream_detailed(object inp, object out, int flag):
    """Decode everything readable from ``inp`` and write the result to ``out``.

    ``inp`` is read through ``b14_readcallback`` (so it needs ``read(n)``
    returning bytes) and ``out`` is written through ``b14_writecallback``
    (so it needs ``write(b)``). ``flag`` is forwarded unchanged to
    ``base16384_decode_stream_detailed``.

    Raises:
        MemoryError: if either scratch buffer cannot be allocated.
        ValueError: if the C call returns anything other than
            ``base16384_err_ok``; the message comes from ``err_to_str``.
    """
    cdef base16384_err_t status

    # Scratch buffers required by the C API; both are released in ``finally``.
    cdef char * enc_scratch = <char *> PyMem_Malloc(<size_t> BASE16384_ENCBUFSZ)
    if enc_scratch is NULL:
        raise MemoryError
    cdef char * dec_scratch = <char *> PyMem_Malloc(<size_t> BASE16384_DECBUFSZ)
    if dec_scratch is NULL:
        PyMem_Free(enc_scratch)
        raise MemoryError

    # The struct members are const, so both streams are built in one
    # expression. client_data holds borrowed pointers to ``inp`` / ``out``,
    # which stay alive for the duration of this call as its arguments.
    cdef base16384_stream_t source = base16384_stream_t(
        f=base16384_io_function_t(reader=b14_readcallback),
        client_data=<const void *> inp,
    )
    cdef base16384_stream_t sink = base16384_stream_t(
        f=base16384_io_function_t(writer=b14_writecallback),
        client_data=<const void *> out,
    )

    try:
        # The callbacks re-acquire the GIL themselves (``with gil``).
        with nogil:
            status = b14_decode_stream_detailed(&source, &sink, enc_scratch, dec_scratch, flag)
        if status != base16384_err_ok:
            raise ValueError(err_to_str(status))
    finally:
        PyMem_Free(enc_scratch)
        PyMem_Free(dec_scratch)
17 changes: 16 additions & 1 deletion pybase16384/backends/cython/base16384.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,22 @@ cdef extern from "base16384.h" nogil:
# base16384_decode_fd_detailed decodes input fd to output fd.
# encbuf & decbuf must be no less than BASE16384_ENCBUFSZ & BASE16384_DECBUFSZ
base16384_err_t b14_decode_fd_detailed "base16384_decode_fd_detailed" (int input, int output, char* encbuf, char* decbuf, int flag)

# stream
# Callback-based ("stream") API declarations.
# Both callback types receive an opaque client_data pointer, a buffer and a
# byte count, and return ssize_t. `except -100` declares -100 as the return
# value that signals a raised Python exception to Cython.
ctypedef ssize_t (*base16384_reader_t) (const void *client_data, void *buffer, size_t count) except -100
ctypedef ssize_t (*base16384_writer_t) (const void *client_data, const void *buffer, size_t count) except -100

# A stream carries exactly one callback; reader and writer share storage.
ctypedef union base16384_io_function_t:
base16384_reader_t reader
base16384_writer_t writer

# A stream: the callback plus an opaque pointer.
# NOTE(review): presumably the library passes client_data back as the
# callback's first argument -- confirm against base16384.h.
ctypedef struct base16384_stream_t:
const base16384_io_function_t f
const void* client_data

# Stream encode/decode entry points; the *_detailed variants take an extra
# int flag, the plain variants do not.
base16384_err_t b14_encode_stream "base16384_encode_stream"(base16384_stream_t* input, base16384_stream_t* output, char* encbuf, char* decbuf)
base16384_err_t b14_encode_stream_detailed "base16384_encode_stream_detailed"(base16384_stream_t* input, base16384_stream_t* output, char* encbuf, char* decbuf, int flag)
base16384_err_t b14_decode_stream "base16384_decode_stream"(base16384_stream_t* input, base16384_stream_t* output, char* encbuf, char* decbuf)
base16384_err_t b14_decode_stream_detailed "base16384_decode_stream_detailed"(base16384_stream_t* input, base16384_stream_t* output, char* encbuf, char* decbuf, int flag)

cdef extern from * nogil:
"""
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def build_extensions(self):
"pybase16384/backends/cython/_core.pyx",
f"./base16384/base14{CPUBIT}.c",
"./base16384/file.c",
"./base16384/wrap.c",
],
include_dirs=[f"./base16384"],
library_dirs=[f"./base16384"],
Expand Down
109 changes: 109 additions & 0 deletions tests/test_bench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
# -*- coding: utf-8 -*
# From https://gist.github.com/mnixry/3608d0207196b847887d34ace8feeb87
# Pure-Python base16384 codec, used by the tests below as a correctness and
# timing baseline for the C extension.
#   encoding(input_bytes)  -> str
#   decoding(input_string) -> bytes
# Each is a single lambda expression; both regroup a flat, MSB-first bit stream
# into fixed-width rows and turn every row back into an integer.
encoding, decoding = (
# --- encoding: bytes -> str ---
lambda input_bytes: (
"".join(
# A row is 14 bits, most significant first; its value is offset by 0x4E00 to
# get the output code point. Bits read past the end of the input are None,
# which is falsy and therefore contributes 0.
chr(0x4E00 + sum(1 << i for i, bit in enumerate(reversed(row)) if bit))
for row in (
# Helper: yield `length` rows, each built by calling next() once per entry of x.
lambda x, length: (
tuple(next(it, None) for it in x) for _ in range(length)
)
)(
# One generator over every input bit (8 per byte, MSB first). `[gen] * 14`
# repeats the SAME generator object 14 times, so a row consumes 14
# consecutive bits.
[((char >> i) & 1 for char in input_bytes for i in reversed(range(8)))]
* 14,
# Number of output characters: ceil(len(input_bytes) * 4 / 7).
len(input_bytes) * 4 // 7 + (1 if len(input_bytes) * 4 % 7 else 0),
)
)
# Tail marker: when the length is not a multiple of 7, append
# chr(0x3D00 + len % 7) so the decoder can recover the exact byte count.
+ (chr(0x3D00 + len(input_bytes) % 7) if len(input_bytes) % 7 else "")
),
# --- decoding: str -> bytes ---
lambda input_string: bytes(
# A row is 8 bits, most significant first -> one output byte.
sum(1 << i for i, bit in enumerate(reversed(row)) if bit)
for row in (
# raw_string: the data characters; residue: len % 7 taken from the tail
# marker, or 0 when there is none.
lambda raw_string, residue: (
# Same row helper as in the encoder.
lambda x, length: (
tuple(next(it, None) for it in x) for _ in range(length)
)
)(
[
(
# 14 bits per character (code point minus 0x4E00), MSB first; the single
# generator is shared by all 8 list slots.
(char >> i) & 1
for char in (ord(i) - 0x4E00 for i in raw_string)
for i in reversed(range(14))
)
]
* 8,
# Output length: 7 bytes for every full group of 4 characters before the
# last group, plus `residue` bytes (or 7 when residue is 0) for the last one.
# An empty raw_string gives -7 + 7 == 0.
(len(raw_string) - 1) // 4 * 7 + (residue or 7),
)
)(
# Keep the whole string when its last character is in 0x4E00..0x8DFF;
# otherwise drop the last character.
input_string[
: None
if input_string and 0x4E00 <= ord(input_string[-1]) <= 0x8DFF
else -1
],
(
# residue = ord(last) - 0x3D00 when that lies in 0..7, else 0.
ord(input_string[-1]) - 0x3D00
if input_string and 0 <= ord(input_string[-1]) - 0x3D00 < 8
else 0
),
)
),
)

import sys

sys.path.append(".")
import time
import unittest
from random import randint
from unittest import TestCase

import pybase16384 as bs


class TestBench(TestCase):
    """Round-trip checks that also print rough timings.

    Compares the C extension (``bs``) with the pure-Python ``encoding`` /
    ``decoding`` reference defined at the top of this module.

    Timing uses ``time.perf_counter()``: it is monotonic and high-resolution,
    so a fast run cannot measure as 0 seconds (which would make the ratio
    printed by ``test_bench2`` divide by zero) and clock adjustments cannot
    produce negative durations.
    """

    def test_bench(self):
        """Round-trip inputs of 2..1001 bytes through both implementations."""
        start = time.perf_counter()
        value = b"="
        for _ in range(1000):
            value += b"x"
            self.assertEqual(bs.decode_safe(bs.encode_safe(value)), value)
        end = time.perf_counter()
        print(f"C extension tooks {end - start} seconds")

        start = time.perf_counter()
        value = b"="
        for _ in range(1000):
            value += b"x"
            self.assertEqual(decoding(encoding(value)), value)
        end = time.perf_counter()
        print(f"Pure python tooks {end - start} seconds")

    def test_bench2(self):
        """Round-trip one 3,000,000-byte input through three code paths."""
        value = b"xxx" * 1000000

        # 1) Allocating API of the C extension.
        start = time.perf_counter()
        self.assertEqual(bs.decode_safe(bs.encode_safe(value)), value)
        end = time.perf_counter()
        ctime = end - start
        print(f"C extension tooks {ctime} seconds")

        # 2) "Into" API writing to caller-provided bytearrays. The equality
        #    check is done after the clock stops so it is not timed.
        start = time.perf_counter()
        buffer = bytearray(bs.encode_len(len(value)))
        buffer_updated = bs._encode_into_safe(value, buffer)
        buffer2 = bytearray(bs.decode_len(buffer_updated, 0))
        buffer_updated2 = bs._decode_into_safe(buffer[:buffer_updated], buffer2)
        end = time.perf_counter()
        self.assertEqual(bytes(buffer2[:buffer_updated2]), value)
        cztime = end - start
        print(f"C extension zerocopy tooks {cztime} seconds")

        # 3) Pure-Python reference.
        start = time.perf_counter()
        self.assertEqual(decoding(encoding(value)), value)
        end = time.perf_counter()
        pytime = end - start
        print(f"Pure python tooks {pytime} seconds")

        print(f"C extension zerocopy is {pytime/cztime} times faster than pure python")


if __name__ == "__main__":
unittest.main()

0 comments on commit 998072b

Please sign in to comment.