// Copyright: (2012-2015) Ben Strasser <code@ben-strasser.net>
// License: BSD-3
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met:
//
// 1. Redistributions of source code must retain the above copyright notice,
// this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright notice,
// this list of conditions and the following disclaimer in the documentation
// and/or other materials provided with the distribution.
//
// 3. Neither the name of the copyright holder nor the names of its contributors
// may be used to endorse or promote products derived from this software
// without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
// ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
// POSSIBILITY OF SUCH DAMAGE.
- #ifndef CSV_H
- #define CSV_H
- #include <algorithm>
- #include <cstdio>
- #include <cstring>
- #include <exception>
- #include <string>
- #include <utility>
- #include <vector>
- #ifndef CSV_IO_NO_THREAD
- #include <condition_variable>
- #include <mutex>
- #include <thread>
- #endif
- #include <cassert>
- #include <cerrno>
- #include <istream>
- #include <limits>
- #include <memory>
- #include <fstream>
- namespace io
- {
////////////////////////////////////////////////////////////////////////////
// LineReader //
////////////////////////////////////////////////////////////////////////////
// Exception types thrown by the line reader. Every exception derives from
// `base` (and therefore from std::exception) and is composed from small
// mix-ins (`with_file_name`, `with_file_line`, `with_errno`) that carry the
// values printed in the message.
namespace error
{
// Common base class. The text returned by what() is produced lazily by the
// derived class through format_error_message().
struct base : std::exception
{
  // Writes the human readable message into error_message_buffer.
  virtual void format_error_message() const = 0;

  // Re-formats the message on every call and returns a pointer into this
  // object, so the pointer is only usable while the exception is alive.
  const char *what() const noexcept override
  {
    format_error_message();
    return error_message_buffer;
  }

  mutable char error_message_buffer[2048];
};

// this only affects the file name in the error message
const int max_file_name_length = 1024;

// Mix-in storing a copy of the file name, truncated to max_file_name_length
// characters.
struct with_file_name
{
  with_file_name() { std::memset(file_name, 0, sizeof(file_name)); }

  // Copies `file_name`; a null pointer clears the stored name.
  void set_file_name(const char *file_name)
  {
    if (file_name == nullptr)
    {
      this->file_name[0] = '\0';
      return;
    }
    // This call to strncpy has parenthesis around it
    // to silence the GCC -Wstringop-truncation warning.
    // It is spelled std::strncpy because <cstring> only guarantees the
    // declaration inside namespace std.
    (std::strncpy(this->file_name, file_name, sizeof(this->file_name)));
    this->file_name[sizeof(this->file_name) - 1] = '\0';
  }

  char file_name[max_file_name_length + 1];
};

// Mix-in storing a line number; -1 until set_file_line() is called.
struct with_file_line
{
  with_file_line() : file_line(-1) {}
  void set_file_line(int file_line) { this->file_line = file_line; }
  int file_line;
};

// Mix-in storing an errno value; 0 until set_errno() is called.
struct with_errno
{
  with_errno() : errno_value(0) {}
  void set_errno(int errno_value) { this->errno_value = errno_value; }
  int errno_value;
};

// A file could not be opened. The reason is appended when errno_value is
// non-zero.
struct can_not_open_file : base, with_file_name, with_errno
{
  void format_error_message() const override
  {
    if (errno_value != 0)
      std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                    "Can not open file \"%s\" because \"%s\".", file_name,
                    std::strerror(errno_value));
    else
      std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                    "Can not open file \"%s\".", file_name);
  }
};

// A single line is longer than the reader can hold. LineReader rejects a line
// once it needs more than block_len = 2^20 bytes including its terminator, so
// the longest accepted line has 2^20-1 characters.
struct line_length_limit_exceeded : base, with_file_name, with_file_line
{
  void format_error_message() const override
  {
    std::snprintf(
        error_message_buffer, sizeof(error_message_buffer),
        "Line number %d in file \"%s\" exceeds the maximum length of 2^20-1.",
        file_line, file_name);
  }
};
} // namespace error
// Abstract source of raw bytes.
class ByteSourceBase
{
public:
  virtual ~ByteSourceBase() = default;

  // Stores up to `size` bytes in `buffer` and returns a byte count. The
  // callers visible in this file use the result as the number of bytes
  // stored and treat 0 as the end of the input.
  virtual int read(char *buffer, int size) = 0;
};
- namespace detail
- {
- class OwningStdIOByteSourceBase : public ByteSourceBase
- {
- public:
- explicit OwningStdIOByteSourceBase(FILE *file) : file(file)
- {
- // Tell the std library that we want to do the buffering ourself.
- std::setvbuf(file, 0, _IONBF, 0);
- }
- int read(char *buffer, int size) { return std::fread(buffer, 1, size, file); }
- ~OwningStdIOByteSourceBase() { std::fclose(file); }
- private:
- FILE *file;
- };
- class NonOwningIStreamByteSource : public ByteSourceBase
- {
- public:
- explicit NonOwningIStreamByteSource(std::istream &in) : in(in) {}
- int read(char *buffer, int size)
- {
- in.read(buffer, size);
- return in.gcount();
- }
- ~NonOwningIStreamByteSource() {}
- private:
- std::istream ∈
- };
- class NonOwningStringByteSource : public ByteSourceBase
- {
- public:
- NonOwningStringByteSource(const char *str, long long size)
- : str(str), remaining_byte_count(size) {}
- int read(char *buffer, int desired_byte_count)
- {
- int to_copy_byte_count = desired_byte_count;
- if (remaining_byte_count < to_copy_byte_count)
- to_copy_byte_count = remaining_byte_count;
- std::memcpy(buffer, str, to_copy_byte_count);
- remaining_byte_count -= to_copy_byte_count;
- str += to_copy_byte_count;
- return to_copy_byte_count;
- }
- ~NonOwningStringByteSource() {}
- private:
- const char *str;
- long long remaining_byte_count;
- };
- #ifndef CSV_IO_NO_THREAD
- class AsynchronousReader
- {
- public:
- void init(std::unique_ptr<ByteSourceBase> arg_byte_source)
- {
- std::unique_lock<std::mutex> guard(lock);
- byte_source = std::move(arg_byte_source);
- desired_byte_count = -1;
- termination_requested = false;
- worker = std::thread([&]
- {
- std::unique_lock<std::mutex> guard(lock);
- try {
- for (;;) {
- read_requested_condition.wait(guard, [&] {
- return desired_byte_count != -1 || termination_requested;
- });
- if (termination_requested)
- return;
- read_byte_count = byte_source->read(buffer, desired_byte_count);
- desired_byte_count = -1;
- if (read_byte_count == 0)
- break;
- read_finished_condition.notify_one();
- }
- } catch (...) {
- read_error = std::current_exception();
- }
- read_finished_condition.notify_one(); });
- }
- bool is_valid() const { return byte_source != nullptr; }
- void start_read(char *arg_buffer, int arg_desired_byte_count)
- {
- std::unique_lock<std::mutex> guard(lock);
- buffer = arg_buffer;
- desired_byte_count = arg_desired_byte_count;
- read_byte_count = -1;
- read_requested_condition.notify_one();
- }
- int finish_read()
- {
- std::unique_lock<std::mutex> guard(lock);
- read_finished_condition.wait(
- guard, [&]
- { return read_byte_count != -1 || read_error; });
- if (read_error)
- std::rethrow_exception(read_error);
- else
- return read_byte_count;
- }
- ~AsynchronousReader()
- {
- if (byte_source != nullptr)
- {
- {
- std::unique_lock<std::mutex> guard(lock);
- termination_requested = true;
- }
- read_requested_condition.notify_one();
- worker.join();
- }
- }
- private:
- std::unique_ptr<ByteSourceBase> byte_source;
- std::thread worker;
- bool termination_requested;
- std::exception_ptr read_error;
- char *buffer;
- int desired_byte_count;
- int read_byte_count;
- std::mutex lock;
- std::condition_variable read_finished_condition;
- std::condition_variable read_requested_condition;
- };
- #endif
- class SynchronousReader
- {
- public:
- void init(std::unique_ptr<ByteSourceBase> arg_byte_source)
- {
- byte_source = std::move(arg_byte_source);
- }
- bool is_valid() const { return byte_source != nullptr; }
- void start_read(char *arg_buffer, int arg_desired_byte_count)
- {
- buffer = arg_buffer;
- desired_byte_count = arg_desired_byte_count;
- }
- int finish_read() { return byte_source->read(buffer, desired_byte_count); }
- private:
- std::unique_ptr<ByteSourceBase> byte_source;
- char *buffer;
- int desired_byte_count;
- };
- } // namespace detail
- class LineReader
- {
- private:
- static const int block_len = 1 << 20;
- std::unique_ptr<char[]> buffer; // must be constructed before (and thus
- // destructed after) the reader!
- #ifdef CSV_IO_NO_THREAD
- detail::SynchronousReader reader;
- #else
- detail::AsynchronousReader reader;
- #endif
- int data_begin;
- int data_end;
- char file_name[error::max_file_name_length + 1];
- unsigned file_line;
- static std::unique_ptr<ByteSourceBase> open_file(const char *file_name)
- {
- // We open the file in binary mode as it makes no difference under *nix
- // and under Windows we handle \r\n newlines ourself.
- FILE *file = std::fopen(file_name, "rb");
- if (file == 0)
- {
- int x = errno; // store errno as soon as possible, doing it after
- // constructor call can fail.
- error::can_not_open_file err;
- err.set_errno(x);
- err.set_file_name(file_name);
- throw err;
- }
- return std::unique_ptr<ByteSourceBase>(
- new detail::OwningStdIOByteSourceBase(file));
- }
- void init(std::unique_ptr<ByteSourceBase> byte_source)
- {
- file_line = 0;
- buffer = std::unique_ptr<char[]>(new char[3 * block_len]);
- data_begin = 0;
- data_end = byte_source->read(buffer.get(), 2 * block_len);
- // Ignore UTF-8 BOM
- if (data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' &&
- buffer[2] == '\xBF')
- data_begin = 3;
- if (data_end == 2 * block_len)
- {
- reader.init(std::move(byte_source));
- reader.start_read(buffer.get() + 2 * block_len, block_len);
- }
- }
- public:
- LineReader() = delete;
- LineReader(const LineReader &) = delete;
- LineReader &operator=(const LineReader &) = delete;
- explicit LineReader(const char *file_name)
- {
- set_file_name(file_name);
- init(open_file(file_name));
- }
- explicit LineReader(const std::string &file_name)
- {
- set_file_name(file_name.c_str());
- init(open_file(file_name.c_str()));
- }
- LineReader(const char *file_name,
- std::unique_ptr<ByteSourceBase> byte_source)
- {
- set_file_name(file_name);
- init(std::move(byte_source));
- }
- LineReader(const std::string &file_name,
- std::unique_ptr<ByteSourceBase> byte_source)
- {
- set_file_name(file_name.c_str());
- init(std::move(byte_source));
- }
- LineReader(const char *file_name, const char *data_begin,
- const char *data_end)
- {
- set_file_name(file_name);
- init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningStringByteSource(
- data_begin, data_end - data_begin)));
- }
- LineReader(const std::string &file_name, const char *data_begin,
- const char *data_end)
- {
- set_file_name(file_name.c_str());
- init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningStringByteSource(
- data_begin, data_end - data_begin)));
- }
- LineReader(const char *file_name, FILE *file)
- {
- set_file_name(file_name);
- init(std::unique_ptr<ByteSourceBase>(
- new detail::OwningStdIOByteSourceBase(file)));
- }
- LineReader(const std::string &file_name, FILE *file)
- {
- set_file_name(file_name.c_str());
- init(std::unique_ptr<ByteSourceBase>(
- new detail::OwningStdIOByteSourceBase(file)));
- }
- LineReader(const char *file_name, std::istream &in)
- {
- set_file_name(file_name);
- init(std::unique_ptr<ByteSourceBase>(
- new detail::NonOwningIStreamByteSource(in)));
- }
- LineReader(const std::string &file_name, std::istream &in)
- {
- set_file_name(file_name.c_str());
- init(std::unique_ptr<ByteSourceBase>(
- new detail::NonOwningIStreamByteSource(in)));
- }
- void set_file_name(const std::string &file_name)
- {
- set_file_name(file_name.c_str());
- }
- void set_file_name(const char *file_name)
- {
- if (file_name != nullptr)
- {
- strncpy(this->file_name, file_name, sizeof(this->file_name));
- this->file_name[sizeof(this->file_name) - 1] = '\0';
- }
- else
- {
- this->file_name[0] = '\0';
- }
- }
- const char *get_truncated_file_name() const { return file_name; }
- void set_file_line(unsigned file_line) { this->file_line = file_line; }
- unsigned get_file_line() const { return file_line; }
- char *next_line()
- {
- if (data_begin == data_end)
- return nullptr;
- ++file_line;
- assert(data_begin < data_end);
- assert(data_end <= block_len * 2);
- if (data_begin >= block_len)
- {
- std::memcpy(buffer.get(), buffer.get() + block_len, block_len);
- data_begin -= block_len;
- data_end -= block_len;
- if (reader.is_valid())
- {
- data_end += reader.finish_read();
- std::memcpy(buffer.get() + block_len, buffer.get() + 2 * block_len,
- block_len);
- reader.start_read(buffer.get() + 2 * block_len, block_len);
- }
- }
- int line_end = data_begin;
- while (line_end != data_end && buffer[line_end] != '\n')
- {
- ++line_end;
- }
- if (line_end - data_begin + 1 > block_len)
- {
- error::line_length_limit_exceeded err;
- err.set_file_name(file_name);
- err.set_file_line(file_line);
- throw err;
- }
- if (line_end != data_end && buffer[line_end] == '\n')
- {
- buffer[line_end] = '\0';
- }
- else
- {
- // some files are missing the newline at the end of the
- // last line
- ++data_end;
- buffer[line_end] = '\0';
- }
- // handle windows \r\n-line breaks
- if (line_end != data_begin && buffer[line_end - 1] == '\r')
- buffer[line_end - 1] = '\0';
- char *ret = buffer.get() + data_begin;
- data_begin = line_end + 1;
- return ret;
- }
- };
////////////////////////////////////////////////////////////////////////////
// CSV //
////////////////////////////////////////////////////////////////////////////
- namespace error
- {
- const int max_column_name_length = 63;
- struct with_column_name
- {
- with_column_name()
- {
- std::memset(column_name, 0, max_column_name_length + 1);
- }
- void set_column_name(const char *column_name)
- {
- if (column_name != nullptr)
- {
- std::strncpy(this->column_name, column_name, max_column_name_length);
- this->column_name[max_column_name_length] = '\0';
- }
- else
- {
- this->column_name[0] = '\0';
- }
- }
- char column_name[max_column_name_length + 1];
- };
- const int max_column_content_length = 63;
- struct with_column_content
- {
- with_column_content()
- {
- std::memset(column_content, 0, max_column_content_length + 1);
- }
- void set_column_content(const char *column_content)
- {
- if (column_content != nullptr)
- {
- std::strncpy(this->column_content, column_content,
- max_column_content_length);
- this->column_content[max_column_content_length] = '\0';
- }
- else
- {
- this->column_content[0] = '\0';
- }
- }
- char column_content[max_column_content_length + 1];
- };
- struct extra_column_in_header : base, with_file_name, with_column_name
- {
- void format_error_message() const override
- {
- std::snprintf(error_message_buffer, sizeof(error_message_buffer),
- R"(Extra column "%s" in header of file "%s".)", column_name,
- file_name);
- }
- };
- struct missing_column_in_header : base, with_file_name, with_column_name
- {
- void format_error_message() const override
- {
- std::snprintf(error_message_buffer, sizeof(error_message_buffer),
- R"(Missing column "%s" in header of file "%s".)", column_name,
- file_name);
- }
- };
- struct duplicated_column_in_header : base, with_file_name, with_column_name
- {
- void format_error_message() const override
- {
- std::snprintf(error_message_buffer, sizeof(error_message_buffer),
- R"(Duplicated column "%s" in header of file "%s".)",
- column_name, file_name);
- }
- };
- struct header_missing : base, with_file_name
- {
- void format_error_message() const override
- {
- std::snprintf(error_message_buffer, sizeof(error_message_buffer),
- "Header missing in file \"%s\".", file_name);
- }
- };
- struct too_few_columns : base, with_file_name, with_file_line
- {
- void format_error_message() const override
- {
- std::snprintf(error_message_buffer, sizeof(error_message_buffer),
- "Too few columns in line %d in file \"%s\".", file_line,
- file_name);
- }
- };
- struct too_many_columns : base, with_file_name, with_file_line
- {
- void format_error_message() const override
- {
- std::snprintf(error_message_buffer, sizeof(error_message_buffer),
- "Too many columns in line %d in file \"%s\".", file_line,
- file_name);
- }
- };
- struct escaped_string_not_closed : base, with_file_name, with_file_line
- {
- void format_error_message() const override
- {
- std::snprintf(error_message_buffer, sizeof(error_message_buffer),
- "Escaped string was not closed in line %d in file \"%s\".",
- file_line, file_name);
- }
- };
- struct integer_must_be_positive : base,
- with_file_name,
- with_file_line,
- with_column_name,
- with_column_content
- {
- void format_error_message() const override
- {
- std::snprintf(
- error_message_buffer, sizeof(error_message_buffer),
- R"(The integer "%s" must be positive or 0 in column "%s" in file "%s" in line "%d".)",
- column_content, column_name, file_name, file_line);
- }
- };
- struct no_digit : base,
- with_file_name,
- with_file_line,
- with_column_name,
- with_column_content
- {
- void format_error_message() const override
- {
- std::snprintf(
- error_message_buffer, sizeof(error_message_buffer),
- R"(The integer "%s" contains an invalid digit in column "%s" in file "%s" in line "%d".)",
- column_content, column_name, file_name, file_line);
- }
- };
- struct integer_overflow : base,
- with_file_name,
- with_file_line,
- with_column_name,
- with_column_content
- {
- void format_error_message() const override
- {
- std::snprintf(
- error_message_buffer, sizeof(error_message_buffer),
- R"(The integer "%s" overflows in column "%s" in file "%s" in line "%d".)",
- column_content, column_name, file_name, file_line);
- }
- };
- struct integer_underflow : base,
- with_file_name,
- with_file_line,
- with_column_name,
- with_column_content
- {
- void format_error_message() const override
- {
- std::snprintf(
- error_message_buffer, sizeof(error_message_buffer),
- R"(The integer "%s" underflows in column "%s" in file "%s" in line "%d".)",
- column_content, column_name, file_name, file_line);
- }
- };
- struct invalid_single_character : base,
- with_file_name,
- with_file_line,
- with_column_name,
- with_column_content
- {
- void format_error_message() const override
- {
- std::snprintf(
- error_message_buffer, sizeof(error_message_buffer),
- R"(The content "%s" of column "%s" in file "%s" in line "%d" is not a single character.)",
- column_content, column_name, file_name, file_line);
- }
- };
- } // namespace error
// Bit flags selecting which header mismatches are tolerated; the header
// parser tests them with bitwise AND, so they can be combined with `|`.
using ignore_column = unsigned int;
static constexpr ignore_column ignore_no_column = 0u;
static constexpr ignore_column ignore_extra_column = 1u << 0;
static constexpr ignore_column ignore_missing_column = 1u << 1;
// Trim policy that strips every character of `trim_char_list` from both ends
// of a cell.
template <char... trim_char_list>
struct trim_chars
{
private:
  // True when `c` equals one of the characters in trim_char_list.
  static bool is_trim_char(char c)
  {
    // The leading entry keeps the array non-empty for an empty list; it is
    // never compared.
    const char candidates[] = {'\0', trim_char_list...};
    const unsigned entry_count = sizeof(candidates) / sizeof(candidates[0]);
    for (unsigned k = 1; k < entry_count; ++k)
      if (candidates[k] == c)
        return true;
    return false;
  }

public:
  // Narrows [str_begin, str_end) so that it neither starts nor ends with a
  // trim character and writes a NUL terminator at the new end.
  static void trim(char *&str_begin, char *&str_end)
  {
    for (; str_begin != str_end && is_trim_char(*str_begin); ++str_begin)
    {
    }
    for (; str_begin != str_end && is_trim_char(str_end[-1]); --str_end)
    {
    }
    *str_end = '\0';
  }
};
// Comment policy that never classifies a line as a comment.
struct no_comment
{
  static bool is_comment(const char * /*line*/)
  {
    return false;
  }
};
// Comment policy: a line is a comment when its first character is one of
// `comment_start_char_list`.
template <char... comment_start_char_list>
struct single_line_comment
{
private:
  // True when `c` equals one of the characters in comment_start_char_list.
  static bool is_comment_start_char(char c)
  {
    // The leading entry keeps the array non-empty for an empty list; it is
    // never compared.
    const char starters[] = {'\0', comment_start_char_list...};
    const unsigned entry_count = sizeof(starters) / sizeof(starters[0]);
    for (unsigned k = 1; k < entry_count; ++k)
      if (starters[k] == c)
        return true;
    return false;
  }

public:
  static bool is_comment(const char *line)
  {
    const char first_char = *line;
    return is_comment_start_char(first_char);
  }
};
// Comment policy: a line is a comment when it is empty or consists only of
// spaces and tabs.
struct empty_line_comment
{
  static bool is_comment(const char *line)
  {
    // Skip the leading blanks; the line is blank exactly when that brings us
    // to the terminator.
    const char *cursor = line;
    while (*cursor == ' ' || *cursor == '\t')
      ++cursor;
    return *cursor == '\0';
  }
};
- template <char... comment_start_char_list>
- struct single_and_empty_line_comment
- {
- static bool is_comment(const char *line)
- {
- return single_line_comment<comment_start_char_list...>::is_comment(line) ||
- empty_line_comment::is_comment(line);
- }
- };
// Quote policy without quoting: a cell ends at the next `sep` or at the end
// of the line, and cell content is left untouched.
template <char sep>
struct no_quote_escape
{
  // Returns a pointer to the first `sep` or to the terminating NUL.
  static const char *find_next_column_end(const char *col_begin)
  {
    const char *cursor = col_begin;
    for (; *cursor != '\0'; ++cursor)
      if (*cursor == sep)
        break;
    return cursor;
  }

  // Nothing to unescape.
  static void unescape(char *&, char *&) {}
};
- template <char sep, char quote>
- struct double_quote_escape
- {
- static const char *find_next_column_end(const char *col_begin)
- {
- while (*col_begin != sep && *col_begin != '\0')
- if (*col_begin != quote)
- ++col_begin;
- else
- {
- do
- {
- ++col_begin;
- while (*col_begin != quote)
- {
- if (*col_begin == '\0')
- throw error::escaped_string_not_closed();
- ++col_begin;
- }
- ++col_begin;
- } while (*col_begin == quote);
- }
- return col_begin;
- }
- static void unescape(char *&col_begin, char *&col_end)
- {
- if (col_end - col_begin >= 2)
- {
- if (*col_begin == quote && *(col_end - 1) == quote)
- {
- ++col_begin;
- --col_end;
- char *out = col_begin;
- for (char *in = col_begin; in != col_end; ++in)
- {
- if (*in == quote && (in + 1) != col_end && *(in + 1) == quote)
- {
- ++in;
- }
- *out = *in;
- ++out;
- }
- col_end = out;
- *col_end = '\0';
- }
- }
- }
- };
- struct throw_on_overflow
- {
- template <class T>
- static void on_overflow(T &)
- {
- throw error::integer_overflow();
- }
- template <class T>
- static void on_underflow(T &)
- {
- throw error::integer_underflow();
- }
- };
// Overflow policy that does nothing: the value is left as the parser had it
// when the overflow was detected.
struct ignore_overflow
{
  template <class T>
  static void on_overflow(T & /*x*/)
  {
  }

  template <class T>
  static void on_underflow(T & /*x*/)
  {
  }
};
// Overflow policy that saturates: an overflowing value becomes the largest
// and an underflowing value the smallest value of its type.
struct set_to_max_on_overflow
{
  template <class T>
  static void on_overflow(T &x)
  {
    typedef std::numeric_limits<T> limits;
    // The parenthesised (limits::max) keeps the max macro of windows.h from
    // being expanded here.
    x = (limits::max)();
  }

  template <class T>
  static void on_underflow(T &x)
  {
    typedef std::numeric_limits<T> limits;
    x = (limits::min)();
  }
};
- namespace detail
- {
// Cuts the first cell off `line`. On return [col_begin, col_end) is that
// cell, NUL terminated in place, and `line` points behind the separator, or
// is nullptr when the cell was the last one of the line.
template <class quote_policy>
void chop_next_column(char *&line, char *&col_begin, char *&col_end)
{
  assert(line != nullptr);

  col_begin = line;
  // The policy returns a pointer to const; re-applying its distance to the
  // mutable pointer removes the constness.
  const auto cell_width =
      quote_policy::find_next_column_end(col_begin) - col_begin;
  col_end = col_begin + cell_width;

  const bool is_last_cell = (*col_end == '\0');
  if (!is_last_cell)
    *col_end = '\0';
  line = is_last_cell ? nullptr : col_end + 1;
}
- template <class trim_policy, class quote_policy>
- void parse_line(char *line, char **sorted_col,
- const std::vector<int> &col_order)
- {
- for (int i : col_order)
- {
- if (line == nullptr)
- throw ::io::error::too_few_columns();
- char *col_begin, *col_end;
- chop_next_column<quote_policy>(line, col_begin, col_end);
- if (i != -1)
- {
- trim_policy::trim(col_begin, col_end);
- quote_policy::unescape(col_begin, col_end);
- sorted_col[i] = col_begin;
- }
- }
- if (line != nullptr)
- throw ::io::error::too_many_columns();
- }
- template <unsigned column_count, class trim_policy, class quote_policy>
- void parse_header_line(char *line, std::vector<int> &col_order,
- const std::string *col_name,
- ignore_column ignore_policy)
- {
- col_order.clear();
- bool found[column_count];
- std::fill(found, found + column_count, false);
- while (line)
- {
- char *col_begin, *col_end;
- chop_next_column<quote_policy>(line, col_begin, col_end);
- trim_policy::trim(col_begin, col_end);
- quote_policy::unescape(col_begin, col_end);
- for (unsigned i = 0; i < column_count; ++i)
- if (col_begin == col_name[i])
- {
- if (found[i])
- {
- error::duplicated_column_in_header err;
- err.set_column_name(col_begin);
- throw err;
- }
- found[i] = true;
- col_order.push_back(i);
- col_begin = 0;
- break;
- }
- if (col_begin)
- {
- if (ignore_policy & ::io::ignore_extra_column)
- col_order.push_back(-1);
- else
- {
- error::extra_column_in_header err;
- err.set_column_name(col_begin);
- throw err;
- }
- }
- }
- if (!(ignore_policy & ::io::ignore_missing_column))
- {
- for (unsigned i = 0; i < column_count; ++i)
- {
- if (!found[i])
- {
- error::missing_column_in_header err;
- err.set_column_name(col_name[i].c_str());
- throw err;
- }
- }
- }
- }
- template <class overflow_policy>
- void parse(char *col, char &x)
- {
- if (!*col)
- throw error::invalid_single_character();
- x = *col;
- ++col;
- if (*col)
- throw error::invalid_single_character();
- }
// Text cells need no conversion.

// Copies the cell text into `x`.
template <class overflow_policy>
void parse(char *col, std::string &x)
{
  x.assign(col);
}

// Stores a pointer to the cell text; no copy is made.
template <class overflow_policy>
void parse(char *col, const char *&x)
{
  x = col;
}

// Stores a mutable pointer to the cell text; no copy is made.
template <class overflow_policy>
void parse(char *col, char *&x)
{
  x = col;
}
- template <class overflow_policy, class T>
- void parse_unsigned_integer(const char *col, T &x)
- {
- x = 0;
- while (*col != '\0')
- {
- if ('0' <= *col && *col <= '9')
- {
- T y = *col - '0';
- if (x > ((std::numeric_limits<T>::max)() - y) / 10)
- {
- overflow_policy::on_overflow(x);
- return;
- }
- x = 10 * x + y;
- }
- else
- throw error::no_digit();
- ++col;
- }
- }
- template <class overflow_policy>
- void parse(char *col, unsigned char &x)
- {
- parse_unsigned_integer<overflow_policy>(col, x);
- }
- template <class overflow_policy>
- void parse(char *col, unsigned short &x)
- {
- parse_unsigned_integer<overflow_policy>(col, x);
- }
- template <class overflow_policy>
- void parse(char *col, unsigned int &x)
- {
- parse_unsigned_integer<overflow_policy>(col, x);
- }
- template <class overflow_policy>
- void parse(char *col, unsigned long &x)
- {
- parse_unsigned_integer<overflow_policy>(col, x);
- }
- template <class overflow_policy>
- void parse(char *col, unsigned long long &x)
- {
- parse_unsigned_integer<overflow_policy>(col, x);
- }
- template <class overflow_policy, class T>
- void parse_signed_integer(const char *col, T &x)
- {
- if (*col == '-')
- {
- ++col;
- x = 0;
- while (*col != '\0')
- {
- if ('0' <= *col && *col <= '9')
- {
- T y = *col - '0';
- if (x < ((std::numeric_limits<T>::min)() + y) / 10)
- {
- overflow_policy::on_underflow(x);
- return;
- }
- x = 10 * x - y;
- }
- else
- throw error::no_digit();
- ++col;
- }
- return;
- }
- else if (*col == '+')
- ++col;
- parse_unsigned_integer<overflow_policy>(col, x);
- }
- template <class overflow_policy>
- void parse(char *col, signed char &x)
- {
- parse_signed_integer<overflow_policy>(col, x);
- }
- template <class overflow_policy>
- void parse(char *col, signed short &x)
- {
- parse_signed_integer<overflow_policy>(col, x);
- }
- template <class overflow_policy>
- void parse(char *col, signed int &x)
- {
- parse_signed_integer<overflow_policy>(col, x);
- }
- template <class overflow_policy>
- void parse(char *col, signed long &x)
- {
- parse_signed_integer<overflow_policy>(col, x);
- }
- template <class overflow_policy>
- void parse(char *col, signed long long &x)
- {
- parse_signed_integer<overflow_policy>(col, x);
- }
- template <class T>
- void parse_float(const char *col, T &x)
- {
- bool is_neg = false;
- if (*col == '-')
- {
- is_neg = true;
- ++col;
- }
- else if (*col == '+')
- ++col;
- x = 0;
- while ('0' <= *col && *col <= '9')
- {
- int y = *col - '0';
- x *= 10;
- x += y;
- ++col;
- }
- if (*col == '.' || *col == ',')
- {
- ++col;
- T pos = 1;
- while ('0' <= *col && *col <= '9')
- {
- pos /= 10;
- int y = *col - '0';
- ++col;
- x += y * pos;
- }
- }
- if (*col == 'e' || *col == 'E')
- {
- ++col;
- int e;
- parse_signed_integer<set_to_max_on_overflow>(col, e);
- if (e != 0)
- {
- T base;
- if (e < 0)
- {
- base = T(0.1);
- e = -e;
- }
- else
- {
- base = T(10);
- }
- while (e != 1)
- {
- if ((e & 1) == 0)
- {
- base = base * base;
- e >>= 1;
- }
- else
- {
- x *= base;
- --e;
- }
- }
- x *= base;
- }
- }
- else
- {
- if (*col != '\0')
- throw error::no_digit();
- }
- if (is_neg)
- x = -x;
- }
- template <class overflow_policy>
- void parse(char *col, float &x)
- {
- parse_float(col, x);
- }
- template <class overflow_policy>
- void parse(char *col, double &x)
- {
- parse_float(col, x);
- }
- template <class overflow_policy>
- void parse(char *col, long double &x)
- {
- parse_float(col, x);
- }
// Catch-all parse() overload: selected only when no specific overload
// matches the column type, and then fails compilation with a readable
// message.
template <class overflow_policy, class T>
void parse(char *col, T &x)
{
  // The condition is written in terms of T on purpose: a plain "false"
  // would be evaluated by GCC as soon as the template is read, whereas
  // "sizeof(T) != sizeof(T)" is only evaluated on instantiation.
  static_assert(sizeof(T) != sizeof(T),
                "Can not parse this type. Only builtin integrals, floats, "
                "char, char*, const char* and std::string are supported");

  // Silence unused-parameter warnings.
  static_cast<void>(col);
  static_cast<void>(x);
}
- } // namespace detail
- template <unsigned column_count, class trim_policy = trim_chars<' ', '\t'>,
- class quote_policy = no_quote_escape<','>,
- class overflow_policy = throw_on_overflow,
- class comment_policy = no_comment>
- class CSVReader
- {
- private:
- LineReader in;
- char *row[column_count];
- std::string column_names[column_count];
- std::vector<int> col_order;
- template <class... ColNames>
- void set_column_names(std::string s, ColNames... cols)
- {
- column_names[column_count - sizeof...(ColNames) - 1] = std::move(s);
- set_column_names(std::forward<ColNames>(cols)...);
- }
- void set_column_names() {}
- public:
- CSVReader() = delete;
- CSVReader(const CSVReader &) = delete;
- CSVReader &operator=(const CSVReader &);
- template <class... Args>
- explicit CSVReader(Args &&...args) : in(std::forward<Args>(args)...)
- {
- std::fill(row, row + column_count, nullptr);
- col_order.resize(column_count);
- for (unsigned i = 0; i < column_count; ++i)
- col_order[i] = i;
- for (unsigned i = 1; i <= column_count; ++i)
- column_names[i - 1] = "col" + std::to_string(i);
- }
- char *next_line() { return in.next_line(); }
- template <class... ColNames>
- void read_header(ignore_column ignore_policy, ColNames... cols)
- {
- static_assert(sizeof...(ColNames) >= column_count,
- "not enough column names specified");
- static_assert(sizeof...(ColNames) <= column_count,
- "too many column names specified");
- try
- {
- set_column_names(std::forward<ColNames>(cols)...);
- char *line;
- do
- {
- line = in.next_line();
- if (!line)
- throw error::header_missing();
- } while (comment_policy::is_comment(line));
- detail::parse_header_line<column_count, trim_policy, quote_policy>(
- line, col_order, column_names, ignore_policy);
- }
- catch (error::with_file_name &err)
- {
- err.set_file_name(in.get_truncated_file_name());
- throw;
- }
- }
- template <class... ColNames>
- void set_header(ColNames... cols)
- {
- static_assert(sizeof...(ColNames) >= column_count,
- "not enough column names specified");
- static_assert(sizeof...(ColNames) <= column_count,
- "too many column names specified");
- set_column_names(std::forward<ColNames>(cols)...);
- std::fill(row, row + column_count, nullptr);
- col_order.resize(column_count);
- for (unsigned i = 0; i < column_count; ++i)
- col_order[i] = i;
- }
- bool has_column(const std::string &name) const
- {
- return col_order.end() !=
- std::find(col_order.begin(), col_order.end(),
- std::find(std::begin(column_names), std::end(column_names),
- name) -
- std::begin(column_names));
- }
- void set_file_name(const std::string &file_name)
- {
- in.set_file_name(file_name);
- }
- void set_file_name(const char *file_name) { in.set_file_name(file_name); }
- const char *get_truncated_file_name() const
- {
- return in.get_truncated_file_name();
- }
- void set_file_line(unsigned file_line) { in.set_file_line(file_line); }
- unsigned get_file_line() const { return in.get_file_line(); }
- private:
- void parse_helper(std::size_t) {}
- template <class T, class... ColType>
- void parse_helper(std::size_t r, T &t, ColType &...cols)
- {
- if (row[r])
- {
- try
- {
- try
- {
- ::io::detail::parse<overflow_policy>(row[r], t);
- }
- catch (error::with_column_content &err)
- {
- err.set_column_content(row[r]);
- throw;
- }
- }
- catch (error::with_column_name &err)
- {
- err.set_column_name(column_names[r].c_str());
- throw;
- }
- }
- parse_helper(r + 1, cols...);
- }
- public:
- template <class... ColType>
- bool read_row(ColType &...cols)
- {
- static_assert(sizeof...(ColType) >= column_count,
- "not enough columns specified");
- static_assert(sizeof...(ColType) <= column_count,
- "too many columns specified");
- try
- {
- try
- {
- char *line;
- do
- {
- line = in.next_line();
- if (!line)
- return false;
- } while (comment_policy::is_comment(line));
- detail::parse_line<trim_policy, quote_policy>(line, row, col_order);
- parse_helper(0, cols...);
- }
- catch (error::with_file_name &err)
- {
- err.set_file_name(in.get_truncated_file_name());
- throw;
- }
- }
- catch (error::with_file_line &err)
- {
- err.set_file_line(in.get_file_line());
- throw;
- }
- return true;
- }
- };
// CSV writing support.

// Thin wrapper around std::ofstream that writes raw text to a file.
//
// The file is opened (with the stream's default output mode) on
// construction and closed by the std::ofstream member's own destructor,
// so no user-defined destructor is needed; leaving it out also keeps the
// type movable.
class LineWriter
{
private:
  std::ofstream file;

public:
  // Opens `filename` for writing.
  // Throws std::runtime_error if the file cannot be opened.
  LineWriter(const std::string &filename) : file(filename)
  {
    if (!file.is_open())
    {
      throw std::runtime_error("Unable to open file");
    }
  }

  // Appends `line` to the file verbatim; no line terminator is added.
  void write(const std::string &line)
  {
    file << line;
  }
};
- // CSVWriter 的实现
- template <unsigned column_count,
- class trim_policy = std::string, // 使用默认值 ' ' 和 '\t'
- class quote_policy = std::string, // 在这里我们可以简单化
- class overflow_policy = std::runtime_error>
- class CSVWriter
- {
- private:
- LineWriter out;
- std::string column_names[column_count];
- void write_header()
- {
- for (unsigned i = 0; i < column_count; ++i)
- {
- if (i > 0)
- out.write(",");
- out.write(column_names[i]);
- }
- out.write("\n");
- }
- public:
- CSVWriter(const std::string &filename, const std::initializer_list<std::string> &cols)
- : out(filename)
- {
- // 使用初始化列表设置列名
- unsigned index = 0;
- for (const auto &col : cols)
- {
- if (index < column_count)
- {
- column_names[index++] = col;
- }
- else
- {
- throw overflow_policy("Too many column names specified");
- }
- }
- write_header(); // 写入头部
- }
- template <class... ColType>
- void write_row(ColType... cols)
- {
- write_row_helper(cols...);
- out.write("\n");
- }
- private:
- template <typename T>
- void write_value(const T &value)
- {
- out.write(std::to_string(value)); // 简化,不进行引用处理
- }
- template <typename T, typename... Rest>
- void write_row_helper(const T &value, const Rest &...rest)
- {
- write_value(value);
- if constexpr (sizeof...(rest) > 0)
- {
- out.write(",");
- write_row_helper(rest...);
- }
- }
- };
- } // namespace io
- #endif
|