From ed88d05a0c34aaf74e52004534e4e0c67b45ae26 Mon Sep 17 00:00:00 2001 From: Nikolai Kondrashov Date: Thu, 16 May 2024 19:34:00 +0300 Subject: [PATCH 1/3] Propagate slurp through all program input methods Add the `slurp` argument to all `_Program.input*()` methods, and pass it through, so it's possible to supply it with every method. But most importantly the `_Program.input()`, which also accepts `text`. --- jq.pyx | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/jq.pyx b/jq.pyx index 0f580c4..9a00d31 100644 --- a/jq.pyx +++ b/jq.pyx @@ -246,24 +246,24 @@ cdef class _Program(object): self._program_bytes = program_bytes self._jq_state_pool = _JqStatePool(program_bytes, args=args) - def input(self, value=_NO_VALUE, text=_NO_VALUE): + def input(self, value=_NO_VALUE, text=_NO_VALUE, *, slurp=False): if (value is _NO_VALUE) == (text is _NO_VALUE): raise ValueError("Either the value or text argument should be set") if text is not _NO_VALUE: - return self.input_text(text) + return self.input_text(text, slurp=slurp) else: - return self.input_value(value) + return self.input_value(value, slurp=slurp) - def input_value(self, value): - return self.input_text(json.dumps(value)) + def input_value(self, value, *, slurp=False): + return self.input_text(json.dumps(value), slurp=slurp) - def input_values(self, values): + def input_values(self, values, *, slurp=False): fileobj = io.StringIO() for value in values: json.dump(value, fileobj) fileobj.write("\n") - return self.input_text(fileobj.getvalue()) + return self.input_text(fileobj.getvalue(), slurp=slurp) def input_text(self, text, *, slurp=False): return _ProgramWithInput(self._jq_state_pool, text.encode("utf8"), slurp=slurp) From 3580e9d54b4fee84f65fd09c00eae85ab516a45f Mon Sep 17 00:00:00 2001 From: Nikolai Kondrashov Date: Wed, 9 Sep 2020 17:31:08 +0300 Subject: [PATCH 2/3] Use PyBytes_AsStringAndSize() Let Python give us the length of the "bytes" it already knows, instead of doing an strlen(). This improves performance a bit. --- jq.pyx | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/jq.pyx b/jq.pyx index 9a00d31..09e1ee7 100644 --- a/jq.pyx +++ b/jq.pyx @@ -3,6 +3,7 @@ import json import threading from cpython.bytes cimport PyBytes_AsString +from cpython.bytes cimport PyBytes_AsStringAndSize from libc.float cimport DBL_MAX from libc.math cimport INFINITY, modf @@ -335,8 +336,10 @@ cdef class _ResultIterator(object): self._slurp = slurp self._ready = False cdef jv_parser* parser = jv_parser_new(0) - cdef char* cbytes_input = PyBytes_AsString(bytes_input) - jv_parser_set_buf(parser, cbytes_input, len(bytes_input), 0) + cdef char* cbytes_input + cdef ssize_t clen_input + PyBytes_AsStringAndSize(bytes_input, &cbytes_input, &clen_input) + jv_parser_set_buf(parser, cbytes_input, clen_input, 0) self._parser = parser def __iter__(self): From f91be41322a28975a522e541fe99771b45ac4257 Mon Sep 17 00:00:00 2001 From: Nikolai Kondrashov Date: Thu, 30 May 2024 17:50:32 +0300 Subject: [PATCH 3/3] Support parsing application/json-seq Support parsing RS-separated streams, as per RFC 7464. --- jq.pyx | 53 +++++++++++++++++++++++++++++++++-------------- tests/jq_tests.py | 51 +++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 88 insertions(+), 16 deletions(-) diff --git a/jq.pyx b/jq.pyx index 09e1ee7..5657419 100644 --- a/jq.pyx +++ b/jq.pyx @@ -19,6 +19,11 @@ cdef extern from "jv.h": JV_KIND_ARRAY, JV_KIND_OBJECT + ctypedef enum: + JV_PARSE_SEQ, + JV_PARSE_STREAMING, + JV_PARSE_STREAM_ERRORS + ctypedef struct jv: pass @@ -49,6 +54,7 @@ cdef extern from "jv.h": jv_parser* jv_parser_new(int) void jv_parser_free(jv_parser*) void jv_parser_set_buf(jv_parser*, const char*, int, int) + int jv_parser_remaining(jv_parser*) jv jv_parser_next(jv_parser*) jv jv_parse(const char*) @@ -267,7 +273,12 @@ cdef class _Program(object): return self.input_text(fileobj.getvalue(), slurp=slurp) def input_text(self, text, *, slurp=False): - return _ProgramWithInput(self._jq_state_pool, text.encode("utf8"), slurp=slurp) + return _ProgramWithInput(self._jq_state_pool, text.encode("utf8"), + slurp=slurp, seq=False) + + def input_text_sequence(self, text, *, slurp=False): + return _ProgramWithInput(self._jq_state_pool, text.encode("utf8"), + slurp=slurp, seq=True) @property def program_string(self): @@ -291,17 +302,20 @@ cdef class _ProgramWithInput(object): cdef _JqStatePool _jq_state_pool cdef object _bytes_input cdef bint _slurp + cdef bint _seq - def __cinit__(self, jq_state_pool, bytes_input, *, bint slurp): + def __cinit__(self, jq_state_pool, bytes_input, *, bint slurp, bint seq): self._jq_state_pool = jq_state_pool self._bytes_input = bytes_input self._slurp = slurp + self._seq = seq def __iter__(self): return self._make_iterator() cdef _ResultIterator _make_iterator(self): - return _ResultIterator(self._jq_state_pool, self._bytes_input, slurp=self._slurp) + return _ResultIterator(self._jq_state_pool, self._bytes_input, + slurp=self._slurp, seq=self._seq) def text(self): # Performance testing suggests that using _jv_to_python (within the @@ -310,6 +324,9 @@ cdef class _ProgramWithInput(object): # See: https://github.com/mwilliamson/jq.py/pull/50 return "\n".join(json.dumps(v) for v in self) + def text_sequence(self): + return "\x1e" + "\n\x1e".join(json.dumps(v) for v in self) + def all(self): return list(self) @@ -329,13 +346,14 @@ cdef class _ResultIterator(object): self._jq_state_pool.release(self._jq) jv_parser_free(self._parser) - def __cinit__(self, _JqStatePool jq_state_pool, bytes bytes_input, *, bint slurp): + def __cinit__(self, _JqStatePool jq_state_pool, bytes bytes_input, *, + bint slurp, bint seq): self._jq_state_pool = jq_state_pool self._jq = jq_state_pool.acquire() self._bytes_input = bytes_input self._slurp = slurp self._ready = False - cdef jv_parser* parser = jv_parser_new(0) + cdef jv_parser* parser = jv_parser_new(JV_PARSE_SEQ if seq else 0) cdef char* cbytes_input cdef ssize_t clen_input PyBytes_AsStringAndSize(bytes_input, &cbytes_input, &clen_input) @@ -384,17 +402,20 @@ cdef class _ResultIterator(object): return 0 cdef inline jv _parse_next_input(self) except *: - cdef jv value = jv_parser_next(self._parser) - if jv_is_valid(value): - return value - elif jv_invalid_has_msg(jv_copy(value)): - error_message = jv_invalid_get_msg(value) - message = jv_string_to_py_string(error_message) - jv_free(error_message) - raise ValueError(u"parse error: " + message) - else: - jv_free(value) - raise StopIteration() + cdef jv value + while True: + value = jv_parser_next(self._parser) + if jv_is_valid(value): + return value + elif jv_invalid_has_msg(jv_copy(value)): + error_message = jv_invalid_get_msg(value) + message = jv_string_to_py_string(error_message) + jv_free(error_message) + raise ValueError(u"parse error: " + message) + else: + if not jv_parser_remaining(self._parser): + jv_free(value) + raise StopIteration() def all(program, value=_NO_VALUE, text=_NO_VALUE): diff --git a/tests/jq_tests.py b/tests/jq_tests.py index 62b3ae0..da724dc 100644 --- a/tests/jq_tests.py +++ b/tests/jq_tests.py @@ -241,6 +241,57 @@ def test_unicode_strings_can_be_used_as_input(): ) +def test_record_separator_character_accepted_in_input(): + assert_equal( + [], + list(jq.compile(".").input_text_sequence('\x1e')) + ) + assert_equal( + [], + list(jq.compile(".").input_text_sequence('\x1e\x1e')) + ) + assert_equal( + [{}], + list(jq.compile(".").input_text_sequence('\x1e{}')) + ) + assert_equal( + [{}], + list(jq.compile(".").input_text_sequence('\x1e\x1e{}')) + ) + assert_equal( + [], + list(jq.compile(".").input_text_sequence('{}\x1e')) + ) + assert_equal( + [], + list(jq.compile(".").input_text_sequence('{}\x1e\x1e')) + ) + assert_equal( + [{}], + list(jq.compile(".").input_text_sequence('\x1e{}\x1e')) + ) + assert_equal( + [[]], + list(jq.compile(".").input_text_sequence('{}\x1e[]')) + ) + assert_equal( + [[]], + list(jq.compile(".").input_text_sequence('{}\x1e\x1e[]')) + ) + assert_equal( + [{},[]], + list(jq.compile(".").input_text_sequence('\x1e{}\x1e[]')) + ) + assert_equal( + [[]], + list(jq.compile(".").input_text_sequence('{}\x1e[]\x1e')) + ) + assert_equal( + [{},[]], + list(jq.compile(".").input_text_sequence('\x1e{}\x1e[]\x1e')) + ) + + def test_unicode_strings_can_be_used_as_programs(): assert_equal( "Dragon‽",