/** * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information * regarding copyright ownership. The ASF licenses this file * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ #ifndef ORC_VECTOR_HH #define ORC_VECTOR_HH #include "orc/orc-config.hh" #include "MemoryPool.hh" #include "Int128.hh" #include #include #include #include #include #include #include namespace orc { /** * The base class for each of the column vectors. This class handles * the generic attributes such as number of elements, capacity, and * notNull vector. */ struct ColumnVectorBatch { ColumnVectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~ColumnVectorBatch(); // the number of slots available uint64_t capacity; // the number of current occupied slots uint64_t numElements; // an array of capacity length marking non-null values DataBuffer notNull; // whether there are any null values bool hasNulls; // whether the vector batch is encoded bool isEncoded; // custom memory pool MemoryPool& memoryPool; /** * Generate a description of this vector as a string. */ virtual std::string toString() const = 0; /** * Change the number of slots to at least the given capacity. * This function is not recursive into subtypes. */ virtual void resize(uint64_t capacity); /** * Empties the vector from all its elements, recursively. * Do not alter the current capacity. */ virtual void clear(); /** * Heap memory used by the batch. */ virtual uint64_t getMemoryUsage(); /** * Check whether the batch length varies depending on data. */ virtual bool hasVariableLength(); private: ColumnVectorBatch(const ColumnVectorBatch&); ColumnVectorBatch& operator=(const ColumnVectorBatch&); }; struct LongVectorBatch: public ColumnVectorBatch { LongVectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~LongVectorBatch(); DataBuffer data; std::string toString() const; void resize(uint64_t capacity); void clear(); uint64_t getMemoryUsage(); }; struct DoubleVectorBatch: public ColumnVectorBatch { DoubleVectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~DoubleVectorBatch(); std::string toString() const; void resize(uint64_t capacity); void clear(); uint64_t getMemoryUsage(); DataBuffer data; }; struct StringVectorBatch: public ColumnVectorBatch { StringVectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~StringVectorBatch(); std::string toString() const; void resize(uint64_t capacity); void clear(); uint64_t getMemoryUsage(); // pointers to the start of each string DataBuffer data; // the length of each string DataBuffer length; // string blob DataBuffer blob; }; struct StringDictionary { StringDictionary(MemoryPool& pool); DataBuffer dictionaryBlob; // Offset for each dictionary key entry. DataBuffer dictionaryOffset; void getValueByIndex(int64_t index, char*& valPtr, int64_t& length) { if (index < 0 || static_cast(index) + 1 >= dictionaryOffset.size()) { throw std::out_of_range("index out of range."); } int64_t* offsetPtr = dictionaryOffset.data(); valPtr = dictionaryBlob.data() + offsetPtr[index]; length = offsetPtr[index + 1] - offsetPtr[index]; } }; /** * Include a index array with reference to corresponding dictionary. * User first obtain index from index array and retrieve string pointer * and length by calling getValueByIndex() from dictionary. */ struct EncodedStringVectorBatch : public StringVectorBatch { EncodedStringVectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~EncodedStringVectorBatch(); std::string toString() const; void resize(uint64_t capacity); std::shared_ptr dictionary; // index for dictionary entry DataBuffer index; }; struct StructVectorBatch: public ColumnVectorBatch { StructVectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~StructVectorBatch(); std::string toString() const; void resize(uint64_t capacity); void clear(); uint64_t getMemoryUsage(); bool hasVariableLength(); std::vector fields; }; struct ListVectorBatch: public ColumnVectorBatch { ListVectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~ListVectorBatch(); std::string toString() const; void resize(uint64_t capacity); void clear(); uint64_t getMemoryUsage(); bool hasVariableLength(); /** * The offset of the first element of each list. * The length of list i is offsets[i+1] - offsets[i]. */ DataBuffer offsets; // the concatenated elements ORC_UNIQUE_PTR elements; }; struct MapVectorBatch: public ColumnVectorBatch { MapVectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~MapVectorBatch(); std::string toString() const; void resize(uint64_t capacity); void clear(); uint64_t getMemoryUsage(); bool hasVariableLength(); /** * The offset of the first element of each map. * The size of map i is offsets[i+1] - offsets[i]. */ DataBuffer offsets; // the concatenated keys ORC_UNIQUE_PTR keys; // the concatenated elements ORC_UNIQUE_PTR elements; }; struct UnionVectorBatch: public ColumnVectorBatch { UnionVectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~UnionVectorBatch(); std::string toString() const; void resize(uint64_t capacity); void clear(); uint64_t getMemoryUsage(); bool hasVariableLength(); /** * For each value, which element of children has the value. */ DataBuffer tags; /** * For each value, the index inside of the child ColumnVectorBatch. */ DataBuffer offsets; // the sub-columns std::vector children; }; struct Decimal { Decimal(const Int128& value, int32_t scale); explicit Decimal(const std::string& value); Decimal(); std::string toString(bool trimTrailingZeros = false) const; Int128 value; int32_t scale; }; struct Decimal64VectorBatch: public ColumnVectorBatch { Decimal64VectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~Decimal64VectorBatch(); std::string toString() const; void resize(uint64_t capacity); void clear(); uint64_t getMemoryUsage(); // total number of digits int32_t precision; // the number of places after the decimal int32_t scale; // the numeric values DataBuffer values; protected: /** * Contains the scales that were read from the file. Should NOT be * used. */ DataBuffer readScales; friend class Decimal64ColumnReader; friend class Decimal64ColumnWriter; }; struct Decimal128VectorBatch: public ColumnVectorBatch { Decimal128VectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~Decimal128VectorBatch(); std::string toString() const; void resize(uint64_t capacity); void clear(); uint64_t getMemoryUsage(); // total number of digits int32_t precision; // the number of places after the decimal int32_t scale; // the numeric values DataBuffer values; protected: /** * Contains the scales that were read from the file. Should NOT be * used. */ DataBuffer readScales; friend class Decimal128ColumnReader; friend class DecimalHive11ColumnReader; friend class Decimal128ColumnWriter; }; /** * A column vector batch for storing timestamp values. * The timestamps are stored split into the time_t value (seconds since * 1 Jan 1970 00:00:00) and the nanoseconds within the time_t value. */ struct TimestampVectorBatch: public ColumnVectorBatch { TimestampVectorBatch(uint64_t capacity, MemoryPool& pool); virtual ~TimestampVectorBatch(); std::string toString() const; void resize(uint64_t capacity); void clear(); uint64_t getMemoryUsage(); // the number of seconds past 1 Jan 1970 00:00 UTC (aka time_t) // Note that we always assume data is in GMT timezone; therefore it is // user's responsibility to convert wall clock time in local timezone // to GMT. DataBuffer data; // the nanoseconds of each value DataBuffer nanoseconds; }; } #endif