csv.h 41 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
7127812791280128112821283128412851286128712881289129012911292129312941295129612971298129913001301130213031304130513061307130813091310131113121313131413151316131713181319132013211322132313241325132613271328132913301331133213331334133513361337133813391340134113421343134413451346134713481349135013511352135313541355135613571358135913601361136213631364136513661367136813691370137113721373137413751376137713781379138013811382138313841385138613871388138913901391139213931394139513961397139813991400140114021403140414051406140714081409141014111412141314141415141614171418141914201421142214231424142514261427142814291430143114321433143414351436143714381439144014411442144314441445144614471448144914501451145214531454145514561457145814591460146114621463146414651466146714681469147014711472147314741475147614771478147914801481148214831484148514861487148814891490149114921493149414951496149714981499150015011502150315041505150615071508150915101511151215131514151515161517151815191520152115221523152415251526152715281529153015311532153315341535153615371538153915401541154215431544154515461547154815491550
  1. // Copyright: (2012-2015) Ben Strasser <code@ben-strasser.net>
  2. // License: BSD-3
  3. //
  4. // All rights reserved.
  5. //
  6. // Redistribution and use in source and binary forms, with or without
  7. // modification, are permitted provided that the following conditions are met:
  8. //
  9. // 1. Redistributions of source code must retain the above copyright notice,
  10. // this list of conditions and the following disclaimer.
  11. //
  12. // 2. Redistributions in binary form must reproduce the above copyright notice,
  13. // this list of conditions and the following disclaimer in the documentation
  14. // and/or other materials provided with the distribution.
  15. //
  16. // 3. Neither the name of the copyright holder nor the names of its contributors
  17. // may be used to endorse or promote products derived from this software
  18. // without specific prior written permission.
  19. //
  20. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  21. // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  22. // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  23. // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  24. // LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  25. // CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  26. // SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  27. // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  28. // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  29. // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  30. // POSSIBILITY OF SUCH DAMAGE.
  31. #ifndef CSV_H
  32. #define CSV_H
  33. #include <algorithm>
  34. #include <cstdio>
  35. #include <cstring>
  36. #include <exception>
  37. #include <string>
  38. #include <utility>
  39. #include <vector>
  40. #ifndef CSV_IO_NO_THREAD
  41. #include <condition_variable>
  42. #include <mutex>
  43. #include <thread>
  44. #endif
  45. #include <cassert>
  46. #include <cerrno>
  47. #include <istream>
  48. #include <limits>
  49. #include <memory>
  50. #include <fstream>
  51. namespace io
  52. {
  53. ////////////////////////////////////////////////////////////////////////////
  54. // LineReader //
  55. ////////////////////////////////////////////////////////////////////////////
  namespace error
  {
  // Common base class of the exceptions defined in this header.
  //
  // Derived types implement format_error_message(), which writes the
  // human-readable text into error_message_buffer. what() formats on demand
  // and returns a pointer into that buffer, so the returned string is only
  // valid while the exception object is alive.
  struct base : std::exception
  {
    virtual void format_error_message() const = 0;

    const char *what() const noexcept override
    {
      format_error_message();
      return error_message_buffer;
    }

    // mutable because what() is const but has to fill the buffer.
    mutable char error_message_buffer[2048];
  };

  // this only affects the file name in the error message
  const int max_file_name_length = 1024;

  // Mixin that stores a (possibly truncated) copy of a file name.
  struct with_file_name
  {
    with_file_name() { std::memset(file_name, 0, sizeof(file_name)); }

    // Copies at most max_file_name_length characters of `file_name`;
    // a null pointer stores the empty string.
    void set_file_name(const char *file_name)
    {
      if (file_name == nullptr)
      {
        this->file_name[0] = '\0';
        return;
      }
      // This call to strncpy has parenthesis around it
      // to silence the GCC -Wstringop-truncation warning.
      // It is std:: qualified because <cstring> is only guaranteed to
      // declare the name inside namespace std.
      (std::strncpy(this->file_name, file_name, sizeof(this->file_name)));
      // strncpy does not terminate the destination when the source is too long.
      this->file_name[sizeof(this->file_name) - 1] = '\0';
    }

    char file_name[max_file_name_length + 1];
  };

  // Mixin that stores a line number; -1 until set_file_line() is called.
  struct with_file_line
  {
    with_file_line() : file_line(-1) {}
    void set_file_line(int file_line) { this->file_line = file_line; }
    int file_line;
  };

  // Mixin that stores an errno value; 0 until set_errno() is called.
  struct with_errno
  {
    with_errno() : errno_value(0) {}
    void set_errno(int errno_value) { this->errno_value = errno_value; }
    int errno_value;
  };

  // Reports a file that could not be opened. The strerror() text is only
  // appended when an errno value other than 0 has been recorded.
  struct can_not_open_file : base, with_file_name, with_errno
  {
    void format_error_message() const override
    {
      if (errno_value == 0)
      {
        std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                      "Can not open file \"%s\".", file_name);
        return;
      }
      std::snprintf(error_message_buffer, sizeof(error_message_buffer),
                    "Can not open file \"%s\" because \"%s\".", file_name,
                    std::strerror(errno_value));
    }
  };

  // Reports a line that is too long for the reader's buffer.
  struct line_length_limit_exceeded : base, with_file_name, with_file_line
  {
    void format_error_message() const override
    {
      // LineReader rejects a line once its length plus terminator exceeds
      // block_len, which is 1 << 20, so the longest accepted line is 2^20-1
      // characters (the previous text claimed 2^24-1).
      std::snprintf(
          error_message_buffer, sizeof(error_message_buffer),
          "Line number %d in file \"%s\" exceeds the maximum length of 2^20-1.",
          file_line, file_name);
    }
  };
  } // namespace error
  // Interface for anything that can hand raw bytes to the line reader.
  class ByteSourceBase
  {
  public:
    // Writes up to `size` bytes into `buffer` and returns how many bytes were
    // produced. The asynchronous reader in this file stops its worker once a
    // call returns 0.
    virtual int read(char *buffer, int size) = 0;

    // Virtual so that implementations can be destroyed through this interface.
    virtual ~ByteSourceBase() = default;
  };
  131. namespace detail
  132. {
  133. class OwningStdIOByteSourceBase : public ByteSourceBase
  134. {
  135. public:
  136. explicit OwningStdIOByteSourceBase(FILE *file) : file(file)
  137. {
  138. // Tell the std library that we want to do the buffering ourself.
  139. std::setvbuf(file, 0, _IONBF, 0);
  140. }
  141. int read(char *buffer, int size) { return std::fread(buffer, 1, size, file); }
  142. ~OwningStdIOByteSourceBase() { std::fclose(file); }
  143. private:
  144. FILE *file;
  145. };
  146. class NonOwningIStreamByteSource : public ByteSourceBase
  147. {
  148. public:
  149. explicit NonOwningIStreamByteSource(std::istream &in) : in(in) {}
  150. int read(char *buffer, int size)
  151. {
  152. in.read(buffer, size);
  153. return in.gcount();
  154. }
  155. ~NonOwningIStreamByteSource() {}
  156. private:
  157. std::istream &in;
  158. };
  159. class NonOwningStringByteSource : public ByteSourceBase
  160. {
  161. public:
  162. NonOwningStringByteSource(const char *str, long long size)
  163. : str(str), remaining_byte_count(size) {}
  164. int read(char *buffer, int desired_byte_count)
  165. {
  166. int to_copy_byte_count = desired_byte_count;
  167. if (remaining_byte_count < to_copy_byte_count)
  168. to_copy_byte_count = remaining_byte_count;
  169. std::memcpy(buffer, str, to_copy_byte_count);
  170. remaining_byte_count -= to_copy_byte_count;
  171. str += to_copy_byte_count;
  172. return to_copy_byte_count;
  173. }
  174. ~NonOwningStringByteSource() {}
  175. private:
  176. const char *str;
  177. long long remaining_byte_count;
  178. };
  179. #ifndef CSV_IO_NO_THREAD
  180. class AsynchronousReader
  181. {
  182. public:
  183. void init(std::unique_ptr<ByteSourceBase> arg_byte_source)
  184. {
  185. std::unique_lock<std::mutex> guard(lock);
  186. byte_source = std::move(arg_byte_source);
  187. desired_byte_count = -1;
  188. termination_requested = false;
  189. worker = std::thread([&]
  190. {
  191. std::unique_lock<std::mutex> guard(lock);
  192. try {
  193. for (;;) {
  194. read_requested_condition.wait(guard, [&] {
  195. return desired_byte_count != -1 || termination_requested;
  196. });
  197. if (termination_requested)
  198. return;
  199. read_byte_count = byte_source->read(buffer, desired_byte_count);
  200. desired_byte_count = -1;
  201. if (read_byte_count == 0)
  202. break;
  203. read_finished_condition.notify_one();
  204. }
  205. } catch (...) {
  206. read_error = std::current_exception();
  207. }
  208. read_finished_condition.notify_one(); });
  209. }
  210. bool is_valid() const { return byte_source != nullptr; }
  211. void start_read(char *arg_buffer, int arg_desired_byte_count)
  212. {
  213. std::unique_lock<std::mutex> guard(lock);
  214. buffer = arg_buffer;
  215. desired_byte_count = arg_desired_byte_count;
  216. read_byte_count = -1;
  217. read_requested_condition.notify_one();
  218. }
  219. int finish_read()
  220. {
  221. std::unique_lock<std::mutex> guard(lock);
  222. read_finished_condition.wait(
  223. guard, [&]
  224. { return read_byte_count != -1 || read_error; });
  225. if (read_error)
  226. std::rethrow_exception(read_error);
  227. else
  228. return read_byte_count;
  229. }
  230. ~AsynchronousReader()
  231. {
  232. if (byte_source != nullptr)
  233. {
  234. {
  235. std::unique_lock<std::mutex> guard(lock);
  236. termination_requested = true;
  237. }
  238. read_requested_condition.notify_one();
  239. worker.join();
  240. }
  241. }
  242. private:
  243. std::unique_ptr<ByteSourceBase> byte_source;
  244. std::thread worker;
  245. bool termination_requested;
  246. std::exception_ptr read_error;
  247. char *buffer;
  248. int desired_byte_count;
  249. int read_byte_count;
  250. std::mutex lock;
  251. std::condition_variable read_finished_condition;
  252. std::condition_variable read_requested_condition;
  253. };
  254. #endif
  255. class SynchronousReader
  256. {
  257. public:
  258. void init(std::unique_ptr<ByteSourceBase> arg_byte_source)
  259. {
  260. byte_source = std::move(arg_byte_source);
  261. }
  262. bool is_valid() const { return byte_source != nullptr; }
  263. void start_read(char *arg_buffer, int arg_desired_byte_count)
  264. {
  265. buffer = arg_buffer;
  266. desired_byte_count = arg_desired_byte_count;
  267. }
  268. int finish_read() { return byte_source->read(buffer, desired_byte_count); }
  269. private:
  270. std::unique_ptr<ByteSourceBase> byte_source;
  271. char *buffer;
  272. int desired_byte_count;
  273. };
  274. } // namespace detail
  275. class LineReader
  276. {
  277. private:
  278. static const int block_len = 1 << 20;
  279. std::unique_ptr<char[]> buffer; // must be constructed before (and thus
  280. // destructed after) the reader!
  281. #ifdef CSV_IO_NO_THREAD
  282. detail::SynchronousReader reader;
  283. #else
  284. detail::AsynchronousReader reader;
  285. #endif
  286. int data_begin;
  287. int data_end;
  288. char file_name[error::max_file_name_length + 1];
  289. unsigned file_line;
  290. static std::unique_ptr<ByteSourceBase> open_file(const char *file_name)
  291. {
  292. // We open the file in binary mode as it makes no difference under *nix
  293. // and under Windows we handle \r\n newlines ourself.
  294. FILE *file = std::fopen(file_name, "rb");
  295. if (file == 0)
  296. {
  297. int x = errno; // store errno as soon as possible, doing it after
  298. // constructor call can fail.
  299. error::can_not_open_file err;
  300. err.set_errno(x);
  301. err.set_file_name(file_name);
  302. throw err;
  303. }
  304. return std::unique_ptr<ByteSourceBase>(
  305. new detail::OwningStdIOByteSourceBase(file));
  306. }
  307. void init(std::unique_ptr<ByteSourceBase> byte_source)
  308. {
  309. file_line = 0;
  310. buffer = std::unique_ptr<char[]>(new char[3 * block_len]);
  311. data_begin = 0;
  312. data_end = byte_source->read(buffer.get(), 2 * block_len);
  313. // Ignore UTF-8 BOM
  314. if (data_end >= 3 && buffer[0] == '\xEF' && buffer[1] == '\xBB' &&
  315. buffer[2] == '\xBF')
  316. data_begin = 3;
  317. if (data_end == 2 * block_len)
  318. {
  319. reader.init(std::move(byte_source));
  320. reader.start_read(buffer.get() + 2 * block_len, block_len);
  321. }
  322. }
  323. public:
  324. LineReader() = delete;
  325. LineReader(const LineReader &) = delete;
  326. LineReader &operator=(const LineReader &) = delete;
  327. explicit LineReader(const char *file_name)
  328. {
  329. set_file_name(file_name);
  330. init(open_file(file_name));
  331. }
  332. explicit LineReader(const std::string &file_name)
  333. {
  334. set_file_name(file_name.c_str());
  335. init(open_file(file_name.c_str()));
  336. }
  337. LineReader(const char *file_name,
  338. std::unique_ptr<ByteSourceBase> byte_source)
  339. {
  340. set_file_name(file_name);
  341. init(std::move(byte_source));
  342. }
  343. LineReader(const std::string &file_name,
  344. std::unique_ptr<ByteSourceBase> byte_source)
  345. {
  346. set_file_name(file_name.c_str());
  347. init(std::move(byte_source));
  348. }
  349. LineReader(const char *file_name, const char *data_begin,
  350. const char *data_end)
  351. {
  352. set_file_name(file_name);
  353. init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningStringByteSource(
  354. data_begin, data_end - data_begin)));
  355. }
  356. LineReader(const std::string &file_name, const char *data_begin,
  357. const char *data_end)
  358. {
  359. set_file_name(file_name.c_str());
  360. init(std::unique_ptr<ByteSourceBase>(new detail::NonOwningStringByteSource(
  361. data_begin, data_end - data_begin)));
  362. }
  363. LineReader(const char *file_name, FILE *file)
  364. {
  365. set_file_name(file_name);
  366. init(std::unique_ptr<ByteSourceBase>(
  367. new detail::OwningStdIOByteSourceBase(file)));
  368. }
  369. LineReader(const std::string &file_name, FILE *file)
  370. {
  371. set_file_name(file_name.c_str());
  372. init(std::unique_ptr<ByteSourceBase>(
  373. new detail::OwningStdIOByteSourceBase(file)));
  374. }
  375. LineReader(const char *file_name, std::istream &in)
  376. {
  377. set_file_name(file_name);
  378. init(std::unique_ptr<ByteSourceBase>(
  379. new detail::NonOwningIStreamByteSource(in)));
  380. }
  381. LineReader(const std::string &file_name, std::istream &in)
  382. {
  383. set_file_name(file_name.c_str());
  384. init(std::unique_ptr<ByteSourceBase>(
  385. new detail::NonOwningIStreamByteSource(in)));
  386. }
  387. void set_file_name(const std::string &file_name)
  388. {
  389. set_file_name(file_name.c_str());
  390. }
  391. void set_file_name(const char *file_name)
  392. {
  393. if (file_name != nullptr)
  394. {
  395. strncpy(this->file_name, file_name, sizeof(this->file_name));
  396. this->file_name[sizeof(this->file_name) - 1] = '\0';
  397. }
  398. else
  399. {
  400. this->file_name[0] = '\0';
  401. }
  402. }
  403. const char *get_truncated_file_name() const { return file_name; }
  404. void set_file_line(unsigned file_line) { this->file_line = file_line; }
  405. unsigned get_file_line() const { return file_line; }
  406. char *next_line()
  407. {
  408. if (data_begin == data_end)
  409. return nullptr;
  410. ++file_line;
  411. assert(data_begin < data_end);
  412. assert(data_end <= block_len * 2);
  413. if (data_begin >= block_len)
  414. {
  415. std::memcpy(buffer.get(), buffer.get() + block_len, block_len);
  416. data_begin -= block_len;
  417. data_end -= block_len;
  418. if (reader.is_valid())
  419. {
  420. data_end += reader.finish_read();
  421. std::memcpy(buffer.get() + block_len, buffer.get() + 2 * block_len,
  422. block_len);
  423. reader.start_read(buffer.get() + 2 * block_len, block_len);
  424. }
  425. }
  426. int line_end = data_begin;
  427. while (line_end != data_end && buffer[line_end] != '\n')
  428. {
  429. ++line_end;
  430. }
  431. if (line_end - data_begin + 1 > block_len)
  432. {
  433. error::line_length_limit_exceeded err;
  434. err.set_file_name(file_name);
  435. err.set_file_line(file_line);
  436. throw err;
  437. }
  438. if (line_end != data_end && buffer[line_end] == '\n')
  439. {
  440. buffer[line_end] = '\0';
  441. }
  442. else
  443. {
  444. // some files are missing the newline at the end of the
  445. // last line
  446. ++data_end;
  447. buffer[line_end] = '\0';
  448. }
  449. // handle windows \r\n-line breaks
  450. if (line_end != data_begin && buffer[line_end - 1] == '\r')
  451. buffer[line_end - 1] = '\0';
  452. char *ret = buffer.get() + data_begin;
  453. data_begin = line_end + 1;
  454. return ret;
  455. }
  456. };
  457. ////////////////////////////////////////////////////////////////////////////
  458. // CSV //
  459. ////////////////////////////////////////////////////////////////////////////
  460. namespace error
  461. {
  // Longest column name that is kept for error messages.
  const int max_column_name_length = 63;

  // Mixin holding a (possibly truncated) copy of a column name.
  struct with_column_name
  {
    with_column_name()
    {
      std::memset(column_name, 0, max_column_name_length + 1);
    }

    // Keeps the first max_column_name_length characters of `column_name`;
    // a null pointer stores the empty string.
    void set_column_name(const char *column_name)
    {
      char *const dest = this->column_name;
      if (column_name == nullptr)
      {
        dest[0] = '\0';
        return;
      }
      std::strncpy(dest, column_name, max_column_name_length);
      // strncpy leaves the text unterminated when the source is too long.
      dest[max_column_name_length] = '\0';
    }

    char column_name[max_column_name_length + 1];
  };
  // Longest piece of cell content that is kept for error messages.
  const int max_column_content_length = 63;

  // Mixin holding a (possibly truncated) copy of a cell's text.
  struct with_column_content
  {
    with_column_content()
    {
      std::memset(column_content, 0, max_column_content_length + 1);
    }

    // Keeps the first max_column_content_length characters of
    // `column_content`; a null pointer stores the empty string.
    void set_column_content(const char *column_content)
    {
      char *const dest = this->column_content;
      if (column_content == nullptr)
      {
        dest[0] = '\0';
        return;
      }
      std::strncpy(dest, column_content, max_column_content_length);
      // strncpy leaves the text unterminated when the source is too long.
      dest[max_column_content_length] = '\0';
    }

    char column_content[max_column_content_length + 1];
  };
  505. struct extra_column_in_header : base, with_file_name, with_column_name
  506. {
  507. void format_error_message() const override
  508. {
  509. std::snprintf(error_message_buffer, sizeof(error_message_buffer),
  510. R"(Extra column "%s" in header of file "%s".)", column_name,
  511. file_name);
  512. }
  513. };
  514. struct missing_column_in_header : base, with_file_name, with_column_name
  515. {
  516. void format_error_message() const override
  517. {
  518. std::snprintf(error_message_buffer, sizeof(error_message_buffer),
  519. R"(Missing column "%s" in header of file "%s".)", column_name,
  520. file_name);
  521. }
  522. };
  523. struct duplicated_column_in_header : base, with_file_name, with_column_name
  524. {
  525. void format_error_message() const override
  526. {
  527. std::snprintf(error_message_buffer, sizeof(error_message_buffer),
  528. R"(Duplicated column "%s" in header of file "%s".)",
  529. column_name, file_name);
  530. }
  531. };
  532. struct header_missing : base, with_file_name
  533. {
  534. void format_error_message() const override
  535. {
  536. std::snprintf(error_message_buffer, sizeof(error_message_buffer),
  537. "Header missing in file \"%s\".", file_name);
  538. }
  539. };
  540. struct too_few_columns : base, with_file_name, with_file_line
  541. {
  542. void format_error_message() const override
  543. {
  544. std::snprintf(error_message_buffer, sizeof(error_message_buffer),
  545. "Too few columns in line %d in file \"%s\".", file_line,
  546. file_name);
  547. }
  548. };
  549. struct too_many_columns : base, with_file_name, with_file_line
  550. {
  551. void format_error_message() const override
  552. {
  553. std::snprintf(error_message_buffer, sizeof(error_message_buffer),
  554. "Too many columns in line %d in file \"%s\".", file_line,
  555. file_name);
  556. }
  557. };
  558. struct escaped_string_not_closed : base, with_file_name, with_file_line
  559. {
  560. void format_error_message() const override
  561. {
  562. std::snprintf(error_message_buffer, sizeof(error_message_buffer),
  563. "Escaped string was not closed in line %d in file \"%s\".",
  564. file_line, file_name);
  565. }
  566. };
  567. struct integer_must_be_positive : base,
  568. with_file_name,
  569. with_file_line,
  570. with_column_name,
  571. with_column_content
  572. {
  573. void format_error_message() const override
  574. {
  575. std::snprintf(
  576. error_message_buffer, sizeof(error_message_buffer),
  577. R"(The integer "%s" must be positive or 0 in column "%s" in file "%s" in line "%d".)",
  578. column_content, column_name, file_name, file_line);
  579. }
  580. };
  581. struct no_digit : base,
  582. with_file_name,
  583. with_file_line,
  584. with_column_name,
  585. with_column_content
  586. {
  587. void format_error_message() const override
  588. {
  589. std::snprintf(
  590. error_message_buffer, sizeof(error_message_buffer),
  591. R"(The integer "%s" contains an invalid digit in column "%s" in file "%s" in line "%d".)",
  592. column_content, column_name, file_name, file_line);
  593. }
  594. };
  595. struct integer_overflow : base,
  596. with_file_name,
  597. with_file_line,
  598. with_column_name,
  599. with_column_content
  600. {
  601. void format_error_message() const override
  602. {
  603. std::snprintf(
  604. error_message_buffer, sizeof(error_message_buffer),
  605. R"(The integer "%s" overflows in column "%s" in file "%s" in line "%d".)",
  606. column_content, column_name, file_name, file_line);
  607. }
  608. };
  609. struct integer_underflow : base,
  610. with_file_name,
  611. with_file_line,
  612. with_column_name,
  613. with_column_content
  614. {
  615. void format_error_message() const override
  616. {
  617. std::snprintf(
  618. error_message_buffer, sizeof(error_message_buffer),
  619. R"(The integer "%s" underflows in column "%s" in file "%s" in line "%d".)",
  620. column_content, column_name, file_name, file_line);
  621. }
  622. };
  623. struct invalid_single_character : base,
  624. with_file_name,
  625. with_file_line,
  626. with_column_name,
  627. with_column_content
  628. {
  629. void format_error_message() const override
  630. {
  631. std::snprintf(
  632. error_message_buffer, sizeof(error_message_buffer),
  633. R"(The content "%s" of column "%s" in file "%s" in line "%d" is not a single character.)",
  634. column_content, column_name, file_name, file_line);
  635. }
  636. };
  637. } // namespace error
  // Bit flags controlling how header parsing reacts to columns that do not
  // line up with the requested ones. The flags can be combined with `|`.
  typedef unsigned int ignore_column;
  // Neither extra nor missing columns are tolerated.
  static const ignore_column ignore_no_column = 0;
  // Header columns that were not requested are skipped instead of rejected.
  static const ignore_column ignore_extra_column = 1;
  // Requested columns that are absent from the header are not an error.
  static const ignore_column ignore_missing_column = 2;
  // Trim policy: strips any of the characters in `trim_char_list` from both
  // ends of a column.
  template <char... trim_char_list>
  struct trim_chars
  {
  private:
    // End of the recursion: no candidates left, so `c` is not a trim char.
    constexpr static bool is_trim_char(char) { return false; }

    // Compares `c` against the first candidate and recurses on the rest.
    template <class... OtherTrimChars>
    constexpr static bool is_trim_char(char c, char trim_char,
                                       OtherTrimChars... other_trim_chars)
    {
      return (c != trim_char) ? is_trim_char(c, other_trim_chars...) : true;
    }

  public:
    // Narrows [str_begin, str_end) in place until neither end is a trim
    // character, then writes a '\0' at the new end.
    static void trim(char *&str_begin, char *&str_end)
    {
      for (; str_begin != str_end && is_trim_char(*str_begin, trim_char_list...);
           ++str_begin)
      {
      }
      for (; str_begin != str_end &&
             is_trim_char(str_end[-1], trim_char_list...);
           --str_end)
      {
      }
      *str_end = '\0';
    }
  };
  // Comment policy that never classifies a line as a comment.
  struct no_comment
  {
    static bool is_comment(const char * /*line*/) { return false; }
  };
  // Comment policy: a line is a comment when its very first character is one
  // of `comment_start_char_list`.
  template <char... comment_start_char_list>
  struct single_line_comment
  {
  private:
    // End of the recursion: no candidates left.
    constexpr static bool is_comment_start_char(char) { return false; }

    // Compares `c` against the first candidate and recurses on the rest.
    template <class... OtherCommentStartChars>
    constexpr static bool
    is_comment_start_char(char c, char comment_start_char,
                          OtherCommentStartChars... other_comment_start_chars)
    {
      return (c != comment_start_char)
                 ? is_comment_start_char(c, other_comment_start_chars...)
                 : true;
    }

  public:
    static bool is_comment(const char *line)
    {
      return is_comment_start_char(line[0], comment_start_char_list...);
    }
  };
  // Comment policy: a line is a comment when it is empty or consists of
  // spaces and tabs only.
  struct empty_line_comment
  {
    static bool is_comment(const char *line)
    {
      // Skip the leading blanks; the line is "empty" exactly when nothing
      // but the terminator follows them.
      const char *cursor = line;
      while (*cursor == ' ' || *cursor == '\t')
        ++cursor;
      return *cursor == '\0';
    }
  };
  702. template <char... comment_start_char_list>
  703. struct single_and_empty_line_comment
  704. {
  705. static bool is_comment(const char *line)
  706. {
  707. return single_line_comment<comment_start_char_list...>::is_comment(line) ||
  708. empty_line_comment::is_comment(line);
  709. }
  710. };
  // Quote policy without any quoting: columns are split at every `sep`.
  template <char sep>
  struct no_quote_escape
  {
    // Returns a pointer to the first `sep` at or after `col_begin`, or to
    // the terminating '\0' if there is none.
    static const char *find_next_column_end(const char *col_begin)
    {
      const char *cursor = col_begin;
      for (; *cursor != '\0'; ++cursor)
      {
        if (*cursor == sep)
          break;
      }
      return cursor;
    }

    // Nothing to unescape; the column is left untouched.
    static void unescape(char *& /*col_begin*/, char *& /*col_end*/) {}
  };
  722. template <char sep, char quote>
  723. struct double_quote_escape
  724. {
  725. static const char *find_next_column_end(const char *col_begin)
  726. {
  727. while (*col_begin != sep && *col_begin != '\0')
  728. if (*col_begin != quote)
  729. ++col_begin;
  730. else
  731. {
  732. do
  733. {
  734. ++col_begin;
  735. while (*col_begin != quote)
  736. {
  737. if (*col_begin == '\0')
  738. throw error::escaped_string_not_closed();
  739. ++col_begin;
  740. }
  741. ++col_begin;
  742. } while (*col_begin == quote);
  743. }
  744. return col_begin;
  745. }
  746. static void unescape(char *&col_begin, char *&col_end)
  747. {
  748. if (col_end - col_begin >= 2)
  749. {
  750. if (*col_begin == quote && *(col_end - 1) == quote)
  751. {
  752. ++col_begin;
  753. --col_end;
  754. char *out = col_begin;
  755. for (char *in = col_begin; in != col_end; ++in)
  756. {
  757. if (*in == quote && (in + 1) != col_end && *(in + 1) == quote)
  758. {
  759. ++in;
  760. }
  761. *out = *in;
  762. ++out;
  763. }
  764. col_end = out;
  765. *col_end = '\0';
  766. }
  767. }
  768. }
  769. };
  770. struct throw_on_overflow
  771. {
  772. template <class T>
  773. static void on_overflow(T &)
  774. {
  775. throw error::integer_overflow();
  776. }
  777. template <class T>
  778. static void on_underflow(T &)
  779. {
  780. throw error::integer_underflow();
  781. }
  782. };
  // Overflow policy that does nothing: the value is left exactly as the
  // caller passed it in.
  struct ignore_overflow
  {
    template <class T>
    static void on_overflow(T & /*value*/)
    {
    }

    template <class T>
    static void on_underflow(T & /*value*/)
    {
    }
  };
  // Overflow policy that saturates: the value is replaced by the limit of
  // its type.
  struct set_to_max_on_overflow
  {
    // Stores std::numeric_limits<T>::max() in `x`.
    template <class T>
    static void on_overflow(T &x)
    {
      // using (std::numeric_limits<T>::max) instead of
      // std::numeric_limits<T>::max to make code including windows.h with its
      // max macro happy
      typedef std::numeric_limits<T> limits;
      x = (limits::max)();
    }

    // Stores std::numeric_limits<T>::min() in `x`.
    template <class T>
    static void on_underflow(T &x)
    {
      // Parenthesised for the same windows.h reason as above.
      typedef std::numeric_limits<T> limits;
      x = (limits::min)();
    }
  };
  806. namespace detail
  807. {
  // Cuts the first column off `line`.
  //
  // On return [col_begin, col_end) is that column, terminated with '\0' in
  // place. `line` is advanced to the character after the separator, or set
  // to nullptr when the column was the last one on the line.
  template <class quote_policy>
  void chop_next_column(char *&line, char *&col_begin, char *&col_end)
  {
    assert(line != nullptr);
    col_begin = line;

    // The policy works on const pointers; re-deriving the end from the
    // mutable start pointer removes the constness without a cast.
    const char *const policy_end =
        quote_policy::find_next_column_end(col_begin);
    col_end = col_begin + (policy_end - col_begin);

    if (*col_end != '\0')
    {
      // Overwrite the separator so the column becomes its own C string.
      *col_end = '\0';
      line = col_end + 1;
      return;
    }
    line = nullptr;
  }
  826. template <class trim_policy, class quote_policy>
  827. void parse_line(char *line, char **sorted_col,
  828. const std::vector<int> &col_order)
  829. {
  830. for (int i : col_order)
  831. {
  832. if (line == nullptr)
  833. throw ::io::error::too_few_columns();
  834. char *col_begin, *col_end;
  835. chop_next_column<quote_policy>(line, col_begin, col_end);
  836. if (i != -1)
  837. {
  838. trim_policy::trim(col_begin, col_end);
  839. quote_policy::unescape(col_begin, col_end);
  840. sorted_col[i] = col_begin;
  841. }
  842. }
  843. if (line != nullptr)
  844. throw ::io::error::too_many_columns();
  845. }
  846. template <unsigned column_count, class trim_policy, class quote_policy>
  847. void parse_header_line(char *line, std::vector<int> &col_order,
  848. const std::string *col_name,
  849. ignore_column ignore_policy)
  850. {
  851. col_order.clear();
  852. bool found[column_count];
  853. std::fill(found, found + column_count, false);
  854. while (line)
  855. {
  856. char *col_begin, *col_end;
  857. chop_next_column<quote_policy>(line, col_begin, col_end);
  858. trim_policy::trim(col_begin, col_end);
  859. quote_policy::unescape(col_begin, col_end);
  860. for (unsigned i = 0; i < column_count; ++i)
  861. if (col_begin == col_name[i])
  862. {
  863. if (found[i])
  864. {
  865. error::duplicated_column_in_header err;
  866. err.set_column_name(col_begin);
  867. throw err;
  868. }
  869. found[i] = true;
  870. col_order.push_back(i);
  871. col_begin = 0;
  872. break;
  873. }
  874. if (col_begin)
  875. {
  876. if (ignore_policy & ::io::ignore_extra_column)
  877. col_order.push_back(-1);
  878. else
  879. {
  880. error::extra_column_in_header err;
  881. err.set_column_name(col_begin);
  882. throw err;
  883. }
  884. }
  885. }
  886. if (!(ignore_policy & ::io::ignore_missing_column))
  887. {
  888. for (unsigned i = 0; i < column_count; ++i)
  889. {
  890. if (!found[i])
  891. {
  892. error::missing_column_in_header err;
  893. err.set_column_name(col_name[i].c_str());
  894. throw err;
  895. }
  896. }
  897. }
  898. }
  899. template <class overflow_policy>
  900. void parse(char *col, char &x)
  901. {
  902. if (!*col)
  903. throw error::invalid_single_character();
  904. x = *col;
  905. ++col;
  906. if (*col)
  907. throw error::invalid_single_character();
  908. }
  // Column parsers for textual targets. overflow_policy is accepted only so
  // that these overloads can be called like every other parse overload; it is
  // not used.

  /// Copies the column text into x.
  template <class overflow_policy>
  void parse(char *col, std::string &x)
  {
    x.assign(col);
  }

  /// Points x at the column text; no copy is made.
  template <class overflow_policy>
  void parse(char *col, const char *&x)
  {
    x = col;
  }

  /// Points x at the (mutable) column text; no copy is made.
  template <class overflow_policy>
  void parse(char *col, char *&x)
  {
    x = col;
  }
  921. template <class overflow_policy, class T>
  922. void parse_unsigned_integer(const char *col, T &x)
  923. {
  924. x = 0;
  925. while (*col != '\0')
  926. {
  927. if ('0' <= *col && *col <= '9')
  928. {
  929. T y = *col - '0';
  930. if (x > ((std::numeric_limits<T>::max)() - y) / 10)
  931. {
  932. overflow_policy::on_overflow(x);
  933. return;
  934. }
  935. x = 10 * x + y;
  936. }
  937. else
  938. throw error::no_digit();
  939. ++col;
  940. }
  941. }
  942. template <class overflow_policy>
  943. void parse(char *col, unsigned char &x)
  944. {
  945. parse_unsigned_integer<overflow_policy>(col, x);
  946. }
  947. template <class overflow_policy>
  948. void parse(char *col, unsigned short &x)
  949. {
  950. parse_unsigned_integer<overflow_policy>(col, x);
  951. }
  952. template <class overflow_policy>
  953. void parse(char *col, unsigned int &x)
  954. {
  955. parse_unsigned_integer<overflow_policy>(col, x);
  956. }
  957. template <class overflow_policy>
  958. void parse(char *col, unsigned long &x)
  959. {
  960. parse_unsigned_integer<overflow_policy>(col, x);
  961. }
  962. template <class overflow_policy>
  963. void parse(char *col, unsigned long long &x)
  964. {
  965. parse_unsigned_integer<overflow_policy>(col, x);
  966. }
  967. template <class overflow_policy, class T>
  968. void parse_signed_integer(const char *col, T &x)
  969. {
  970. if (*col == '-')
  971. {
  972. ++col;
  973. x = 0;
  974. while (*col != '\0')
  975. {
  976. if ('0' <= *col && *col <= '9')
  977. {
  978. T y = *col - '0';
  979. if (x < ((std::numeric_limits<T>::min)() + y) / 10)
  980. {
  981. overflow_policy::on_underflow(x);
  982. return;
  983. }
  984. x = 10 * x - y;
  985. }
  986. else
  987. throw error::no_digit();
  988. ++col;
  989. }
  990. return;
  991. }
  992. else if (*col == '+')
  993. ++col;
  994. parse_unsigned_integer<overflow_policy>(col, x);
  995. }
  996. template <class overflow_policy>
  997. void parse(char *col, signed char &x)
  998. {
  999. parse_signed_integer<overflow_policy>(col, x);
  1000. }
  1001. template <class overflow_policy>
  1002. void parse(char *col, signed short &x)
  1003. {
  1004. parse_signed_integer<overflow_policy>(col, x);
  1005. }
  1006. template <class overflow_policy>
  1007. void parse(char *col, signed int &x)
  1008. {
  1009. parse_signed_integer<overflow_policy>(col, x);
  1010. }
  1011. template <class overflow_policy>
  1012. void parse(char *col, signed long &x)
  1013. {
  1014. parse_signed_integer<overflow_policy>(col, x);
  1015. }
  1016. template <class overflow_policy>
  1017. void parse(char *col, signed long long &x)
  1018. {
  1019. parse_signed_integer<overflow_policy>(col, x);
  1020. }
  1021. template <class T>
  1022. void parse_float(const char *col, T &x)
  1023. {
  1024. bool is_neg = false;
  1025. if (*col == '-')
  1026. {
  1027. is_neg = true;
  1028. ++col;
  1029. }
  1030. else if (*col == '+')
  1031. ++col;
  1032. x = 0;
  1033. while ('0' <= *col && *col <= '9')
  1034. {
  1035. int y = *col - '0';
  1036. x *= 10;
  1037. x += y;
  1038. ++col;
  1039. }
  1040. if (*col == '.' || *col == ',')
  1041. {
  1042. ++col;
  1043. T pos = 1;
  1044. while ('0' <= *col && *col <= '9')
  1045. {
  1046. pos /= 10;
  1047. int y = *col - '0';
  1048. ++col;
  1049. x += y * pos;
  1050. }
  1051. }
  1052. if (*col == 'e' || *col == 'E')
  1053. {
  1054. ++col;
  1055. int e;
  1056. parse_signed_integer<set_to_max_on_overflow>(col, e);
  1057. if (e != 0)
  1058. {
  1059. T base;
  1060. if (e < 0)
  1061. {
  1062. base = T(0.1);
  1063. e = -e;
  1064. }
  1065. else
  1066. {
  1067. base = T(10);
  1068. }
  1069. while (e != 1)
  1070. {
  1071. if ((e & 1) == 0)
  1072. {
  1073. base = base * base;
  1074. e >>= 1;
  1075. }
  1076. else
  1077. {
  1078. x *= base;
  1079. --e;
  1080. }
  1081. }
  1082. x *= base;
  1083. }
  1084. }
  1085. else
  1086. {
  1087. if (*col != '\0')
  1088. throw error::no_digit();
  1089. }
  1090. if (is_neg)
  1091. x = -x;
  1092. }
  1093. template <class overflow_policy>
  1094. void parse(char *col, float &x)
  1095. {
  1096. parse_float(col, x);
  1097. }
  1098. template <class overflow_policy>
  1099. void parse(char *col, double &x)
  1100. {
  1101. parse_float(col, x);
  1102. }
  1103. template <class overflow_policy>
  1104. void parse(char *col, long double &x)
  1105. {
  1106. parse_float(col, x);
  1107. }
  /// Catch-all overload selected for every column type that has no dedicated
  /// parser. Instantiating it is a compile-time error that names the
  /// supported types.
  template <class overflow_policy, class T>
  void parse(char *, T &) // parameters left unnamed: neither one is used
  {
    // A plain "false" would be evaluated by GCC as soon as the template is
    // read. The condition is made dependent on T so that it is only checked
    // when the overload is actually instantiated.
    static_assert(sizeof(T) != sizeof(T),
                  "Can not parse this type. Only builtin integrals, floats, "
                  "char, char*, const char* and std::string are supported");
  }
  1121. } // namespace detail
  1122. template <unsigned column_count, class trim_policy = trim_chars<' ', '\t'>,
  1123. class quote_policy = no_quote_escape<','>,
  1124. class overflow_policy = throw_on_overflow,
  1125. class comment_policy = no_comment>
  1126. class CSVReader
  1127. {
  1128. private:
  1129. LineReader in;
  1130. char *row[column_count];
  1131. std::string column_names[column_count];
  1132. std::vector<int> col_order;
  1133. template <class... ColNames>
  1134. void set_column_names(std::string s, ColNames... cols)
  1135. {
  1136. column_names[column_count - sizeof...(ColNames) - 1] = std::move(s);
  1137. set_column_names(std::forward<ColNames>(cols)...);
  1138. }
  1139. void set_column_names() {}
  1140. public:
  1141. CSVReader() = delete;
  1142. CSVReader(const CSVReader &) = delete;
  1143. CSVReader &operator=(const CSVReader &);
  1144. template <class... Args>
  1145. explicit CSVReader(Args &&...args) : in(std::forward<Args>(args)...)
  1146. {
  1147. std::fill(row, row + column_count, nullptr);
  1148. col_order.resize(column_count);
  1149. for (unsigned i = 0; i < column_count; ++i)
  1150. col_order[i] = i;
  1151. for (unsigned i = 1; i <= column_count; ++i)
  1152. column_names[i - 1] = "col" + std::to_string(i);
  1153. }
  1154. char *next_line() { return in.next_line(); }
  1155. template <class... ColNames>
  1156. void read_header(ignore_column ignore_policy, ColNames... cols)
  1157. {
  1158. static_assert(sizeof...(ColNames) >= column_count,
  1159. "not enough column names specified");
  1160. static_assert(sizeof...(ColNames) <= column_count,
  1161. "too many column names specified");
  1162. try
  1163. {
  1164. set_column_names(std::forward<ColNames>(cols)...);
  1165. char *line;
  1166. do
  1167. {
  1168. line = in.next_line();
  1169. if (!line)
  1170. throw error::header_missing();
  1171. } while (comment_policy::is_comment(line));
  1172. detail::parse_header_line<column_count, trim_policy, quote_policy>(
  1173. line, col_order, column_names, ignore_policy);
  1174. }
  1175. catch (error::with_file_name &err)
  1176. {
  1177. err.set_file_name(in.get_truncated_file_name());
  1178. throw;
  1179. }
  1180. }
  1181. template <class... ColNames>
  1182. void set_header(ColNames... cols)
  1183. {
  1184. static_assert(sizeof...(ColNames) >= column_count,
  1185. "not enough column names specified");
  1186. static_assert(sizeof...(ColNames) <= column_count,
  1187. "too many column names specified");
  1188. set_column_names(std::forward<ColNames>(cols)...);
  1189. std::fill(row, row + column_count, nullptr);
  1190. col_order.resize(column_count);
  1191. for (unsigned i = 0; i < column_count; ++i)
  1192. col_order[i] = i;
  1193. }
  1194. bool has_column(const std::string &name) const
  1195. {
  1196. return col_order.end() !=
  1197. std::find(col_order.begin(), col_order.end(),
  1198. std::find(std::begin(column_names), std::end(column_names),
  1199. name) -
  1200. std::begin(column_names));
  1201. }
  1202. void set_file_name(const std::string &file_name)
  1203. {
  1204. in.set_file_name(file_name);
  1205. }
  1206. void set_file_name(const char *file_name) { in.set_file_name(file_name); }
  1207. const char *get_truncated_file_name() const
  1208. {
  1209. return in.get_truncated_file_name();
  1210. }
  1211. void set_file_line(unsigned file_line) { in.set_file_line(file_line); }
  1212. unsigned get_file_line() const { return in.get_file_line(); }
  1213. private:
  1214. void parse_helper(std::size_t) {}
  1215. template <class T, class... ColType>
  1216. void parse_helper(std::size_t r, T &t, ColType &...cols)
  1217. {
  1218. if (row[r])
  1219. {
  1220. try
  1221. {
  1222. try
  1223. {
  1224. ::io::detail::parse<overflow_policy>(row[r], t);
  1225. }
  1226. catch (error::with_column_content &err)
  1227. {
  1228. err.set_column_content(row[r]);
  1229. throw;
  1230. }
  1231. }
  1232. catch (error::with_column_name &err)
  1233. {
  1234. err.set_column_name(column_names[r].c_str());
  1235. throw;
  1236. }
  1237. }
  1238. parse_helper(r + 1, cols...);
  1239. }
  1240. public:
  1241. template <class... ColType>
  1242. bool read_row(ColType &...cols)
  1243. {
  1244. static_assert(sizeof...(ColType) >= column_count,
  1245. "not enough columns specified");
  1246. static_assert(sizeof...(ColType) <= column_count,
  1247. "too many columns specified");
  1248. try
  1249. {
  1250. try
  1251. {
  1252. char *line;
  1253. do
  1254. {
  1255. line = in.next_line();
  1256. if (!line)
  1257. return false;
  1258. } while (comment_policy::is_comment(line));
  1259. detail::parse_line<trim_policy, quote_policy>(line, row, col_order);
  1260. parse_helper(0, cols...);
  1261. }
  1262. catch (error::with_file_name &err)
  1263. {
  1264. err.set_file_name(in.get_truncated_file_name());
  1265. throw;
  1266. }
  1267. }
  1268. catch (error::with_file_line &err)
  1269. {
  1270. err.set_file_line(in.get_file_line());
  1271. throw;
  1272. }
  1273. return true;
  1274. }
  1275. };
  1276. // CSVWriter
  /// Thin wrapper around an output file stream that appends text to a file.
  class LineWriter
  {
  private:
    std::ofstream file;

  public:
    /// Opens `filename` for writing.
    /// Throws std::runtime_error when the file cannot be opened.
    LineWriter(const std::string &filename) : file(filename)
    {
      if (!file.is_open())
        throw std::runtime_error("Unable to open file");
    }

    /// Closes the file if it is still open.
    ~LineWriter()
    {
      if (!file.is_open())
        return;
      file.close();
    }

    /// Writes `line` exactly as given; no line terminator is added.
    void write(const std::string &line)
    {
      file << line;
    }
  };
  1302. // CSVWriter 的实现
  1303. template <unsigned column_count,
  1304. class trim_policy = std::string, // 使用默认值 ' ' 和 '\t'
  1305. class quote_policy = std::string, // 在这里我们可以简单化
  1306. class overflow_policy = std::runtime_error>
  1307. class CSVWriter
  1308. {
  1309. private:
  1310. LineWriter out;
  1311. std::string column_names[column_count];
  1312. void write_header()
  1313. {
  1314. for (unsigned i = 0; i < column_count; ++i)
  1315. {
  1316. if (i > 0)
  1317. out.write(",");
  1318. out.write(column_names[i]);
  1319. }
  1320. out.write("\n");
  1321. }
  1322. public:
  1323. CSVWriter(const std::string &filename, const std::initializer_list<std::string> &cols)
  1324. : out(filename)
  1325. {
  1326. // 使用初始化列表设置列名
  1327. unsigned index = 0;
  1328. for (const auto &col : cols)
  1329. {
  1330. if (index < column_count)
  1331. {
  1332. column_names[index++] = col;
  1333. }
  1334. else
  1335. {
  1336. throw overflow_policy("Too many column names specified");
  1337. }
  1338. }
  1339. write_header(); // 写入头部
  1340. }
  1341. template <class... ColType>
  1342. void write_row(ColType... cols)
  1343. {
  1344. write_row_helper(cols...);
  1345. out.write("\n");
  1346. }
  1347. private:
  1348. template <typename T>
  1349. void write_value(const T &value)
  1350. {
  1351. out.write(std::to_string(value)); // 简化,不进行引用处理
  1352. }
  1353. template <typename T, typename... Rest>
  1354. void write_row_helper(const T &value, const Rest &...rest)
  1355. {
  1356. write_value(value);
  1357. if constexpr (sizeof...(rest) > 0)
  1358. {
  1359. out.write(",");
  1360. write_row_helper(rest...);
  1361. }
  1362. }
  1363. };
  1364. } // namespace io
  1365. #endif