Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added RLEBigPackingHybrid encode function #611

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/lib/dremel/src/Flow/Dremel/Dremel.php
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ private function assertInput(array $repetitions, array $definitions) : void
{
if (\count($repetitions) !== 0) {
if (\count(\array_unique([\count($repetitions), \count($definitions)])) !== 1) {
throw new InvalidArgumentException('repetitions, definitions and values count must be exactly the same');
throw new InvalidArgumentException('repetitions, definitions and values count must be exactly the same, repetitions: ' . \count($repetitions) . ', definitions: ' . \count($definitions));
}
}

Expand Down
43 changes: 43 additions & 0 deletions src/lib/parquet/src/Flow/Parquet/BinaryWriter.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
<?php declare(strict_types=1);

namespace Flow\Parquet;

/**
 * Contract for writers that serialize batches of primitive values into a binary stream.
 *
 * Every write method takes a whole array of values; none of them returns anything,
 * so the only observable result through this interface is length().
 */
interface BinaryWriter
{
/**
 * Returns the writer's size counter as a DataSize.
 *
 * NOTE(review): expected to reflect everything written so far — confirm per implementation.
 */
public function length() : DataSize;

/**
 * Writes a sequence of individual bits.
 *
 * @param array<int> $bits bit values; presumably each is 0 or 1 — bit order within a byte is up to the implementation
 */
public function writeBits(array $bits) : void;

/**
 * Writes a sequence of boolean values.
 *
 * @param array<bool> $values
 */
public function writeBooleans(array $values) : void;

/**
 * Writes a sequence of raw bytes.
 *
 * @param array<int> $bytes byte values; presumably each in the range 0-255 — TODO confirm
 */
public function writeBytes(array $bytes) : void;

/**
 * Writes a sequence of 32-bit integers.
 *
 * @param array<int> $ints
 */
public function writeInts32(array $ints) : void;

/**
 * Writes a sequence of 64-bit integers.
 *
 * @param array<int> $ints
 */
public function writeInts64(array $ints) : void;

/**
 * Writes a sequence of strings.
 *
 * @param array<string> $strings
 */
public function writeStrings(array $strings) : void;

/**
 * Writes a sequence of integers using a variable-length encoding.
 *
 * @param array<int> $values
 */
public function writeVarInts32(array $values) : void;
}
119 changes: 119 additions & 0 deletions src/lib/parquet/src/Flow/Parquet/BinaryWriter/BinaryBufferWriter.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
<?php declare(strict_types=1);

namespace Flow\Parquet\BinaryWriter;

use Flow\Parquet\BinaryWriter;
use Flow\Parquet\ByteOrder;
use Flow\Parquet\DataSize;

/**
 * BinaryWriter that appends encoded values to a caller-owned string buffer.
 *
 * The buffer is bound by reference and reset to an empty string on construction,
 * so the caller observes every write through its own variable. Each write also
 * advances the DataSize returned by length().
 */
final class BinaryBufferWriter implements BinaryWriter
{
    private DataSize $length;

    /**
     * @param string $buffer target buffer, taken by reference and cleared here
     * @param ByteOrder $byteOrder byte order used for fixed-width integers and string length prefixes
     */
    public function __construct(private string &$buffer, private readonly ByteOrder $byteOrder = ByteOrder::LITTLE_ENDIAN)
    {
        $this->buffer = '';
        $this->length = new DataSize(0);
    }

    /**
     * Returns the running size counter that every write method advances.
     */
    public function length() : DataSize
    {
        return $this->length;
    }

    /**
     * Packs bits into bytes, least significant bit first, and appends them.
     *
     * A trailing group of fewer than 8 bits is emitted as one byte whose
     * unused high bits are zero.
     *
     * @param array<int> $bits any truthy entry sets the bit, any falsy entry leaves it clear
     */
    public function writeBits(array $bits) : void
    {
        $byte = 0;
        $bitIndex = 0;

        foreach ($bits as $bit) {
            if ($bit) {
                $byte |= (1 << $bitIndex);
            }

            $bitIndex++;

            if ($bitIndex === 8) {
                $this->buffer .= \chr($byte);
                $this->length->addBytes(1);
                $byte = 0;
                $bitIndex = 0;
            }
        }

        // Flush the partially filled last byte, if any.
        if ($bitIndex > 0) {
            $this->buffer .= \chr($byte);
            $this->length->addBytes(1);
        }
    }

    /**
     * Writes booleans as bits (true => 1, false => 0) using writeBits().
     *
     * @param array<bool> $values
     */
    public function writeBooleans(array $values) : void
    {
        $bits = [];

        foreach ($values as $value) {
            $bits[] = $value ? 1 : 0;
        }

        $this->writeBits($bits);
    }

    /**
     * Appends one byte per array entry.
     *
     * @param array<int> $bytes
     */
    public function writeBytes(array $bytes) : void
    {
        foreach ($bytes as $byte) {
            $this->buffer .= \chr($byte);
        }

        $this->length->addBytes(\count($bytes));
    }

    /**
     * Appends each integer as 4 bytes in the configured byte order.
     *
     * @param array<int> $ints
     */
    public function writeInts32(array $ints) : void
    {
        // pack(): 'N' = 32-bit big endian, 'V' = 32-bit little endian.
        $format = $this->byteOrder === ByteOrder::BIG_ENDIAN ? 'N' : 'V';

        foreach ($ints as $int) {
            $this->buffer .= \pack($format, $int);
        }

        $this->length->addBytes(\count($ints) * 4);
    }

    /**
     * Appends each integer as 8 bytes in the configured byte order.
     *
     * @param array<int> $ints
     */
    public function writeInts64(array $ints) : void
    {
        // pack(): 'J' = 64-bit big endian, 'P' = 64-bit little endian.
        $format = $this->byteOrder === ByteOrder::BIG_ENDIAN ? 'J' : 'P';

        foreach ($ints as $int) {
            $this->buffer .= \pack($format, $int);
        }

        $this->length->addBytes(\count($ints) * 8);
    }

    /**
     * Appends each string prefixed with its byte length as a 4-byte integer
     * in the configured byte order.
     *
     * @param array<string> $strings
     */
    public function writeStrings(array $strings) : void
    {
        $format = $this->byteOrder === ByteOrder::BIG_ENDIAN ? 'N' : 'V';
        $written = 0;

        foreach ($strings as $string) {
            $length = \strlen($string);
            $this->buffer .= \pack($format, $length);
            $this->buffer .= $string;
            // 4 bytes of length prefix plus the payload itself.
            $written += 4 + $length;
        }

        $this->length->addBytes($written);
    }

    /**
     * Appends each integer as a base-128 varint: 7 payload bits per byte,
     * least significant group first, high bit set on every byte but the last.
     *
     * Negative values are encoded as their unsigned 32-bit two's-complement
     * pattern, which always takes 5 bytes.
     *
     * @param array<int> $values
     */
    public function writeVarInts32(array $values) : void
    {
        foreach ($values as $value) {
            // PHP's >> is an arithmetic shift, so a negative value converges to -1
            // instead of 0 and the loop below would never terminate. Reinterpreting
            // negatives as unsigned 32-bit keeps the loop finite; non-negative
            // input is left untouched.
            if ($value < 0) {
                $value &= 0xFFFFFFFF;
            }

            do {
                $temp = $value & 0x7F;
                $value >>= 7;

                if ($value !== 0) {
                    // More groups follow: set the continuation bit.
                    $temp |= 0x80;
                }

                $this->buffer .= \chr($temp);
                $this->length->addBytes(1);
            } while ($value !== 0);
        }
    }
}
15 changes: 15 additions & 0 deletions src/lib/parquet/src/Flow/Parquet/DataSize.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,11 @@ public function __construct(private int $bits)
{
}

/**
 * Named constructor: creates an instance from a byte count.
 *
 * The constructor works in bits, so the count is multiplied by 8 first.
 */
public static function fromBytes(int $bytes) : self
{
    $bits = $bytes * 8;

    return new self($bits);
}

public function add(int|self $bits) : void
{
if ($bits instanceof self) {
Expand All @@ -23,6 +28,11 @@ public function add(int|self $bits) : void
$this->bytes = (int) \round($this->bits / 8, 0, PHP_ROUND_HALF_DOWN);
}

/**
 * Increases the size by a number of bytes.
 *
 * Converts the byte count to bits (x8) and delegates to add().
 */
public function addBytes(int $bytes) : void
{
    $bits = $bytes * 8;

    $this->add($bits);
}

public function bits() : int
{
return $this->bits;
Expand All @@ -49,4 +59,9 @@ public function sub(int|self $bits) : void
$this->bits -= $bits;
$this->bytes = (int) \round($this->bits / 8, 0, PHP_ROUND_HALF_DOWN);
}

/**
 * Decreases the size by a number of bytes.
 *
 * Converts the byte count to bits (x8) and delegates to sub().
 */
public function subBytes(int $bytes) : void
{
    $bits = $bytes * 8;

    $this->sub($bits);
}
}
44 changes: 44 additions & 0 deletions src/lib/parquet/src/Flow/Parquet/ParquetFile/Data/BitWidth.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<?php declare(strict_types=1);

namespace Flow\Parquet\ParquetFile\Data;

/**
 * Helpers for working out how many bits or bytes are needed to store integers.
 */
final class BitWidth
{
    /**
     * Returns the minimum number of bits needed to represent $value.
     *
     * Uses integer shifts rather than a floating-point logarithm so the result
     * stays exact for the whole int range (floats lose precision from 2^53 up,
     * which made the logarithm under-report the width of large values).
     *
     * @return int 0 for zero and for negative values, otherwise the position of the highest set bit
     */
    public static function calculate(int $value) : int
    {
        $width = 0;

        while ($value > 0) {
            $width++;
            $value >>= 1;
        }

        return $width;
    }

    /**
     * Returns the bit width required by the largest value in the array.
     *
     * @param array<int> $ints
     *
     * @return int 0 when the array is empty or its maximum is not positive
     */
    public static function fromArray(array $ints) : int
    {
        if ([] === $ints) {
            return 0;
        }

        return self::calculate(\max($ints));
    }

    /**
     * Splits $value into bytes, least significant byte first.
     *
     * The number of bytes is $bitWidth rounded up to a whole byte count.
     *
     * @return array<int> each entry is in the range 0-255
     */
    public static function toBytes(int $value, int $bitWidth) : array
    {
        $bytes = [];
        // ceil($bitWidth / 8) using integer arithmetic only.
        $width = \intdiv($bitWidth + 7, 8);

        for ($i = 0; $i < $width; $i++) {
            $bytes[] = ($value >> ($i * 8)) & 0xFF;
        }

        return $bytes;
    }
}
Loading
Loading