diff --git a/include/rosa/support/csv/CSVReader.hpp b/include/rosa/support/csv/CSVReader.hpp index 2d65d1c..d21cf07 100755 --- a/include/rosa/support/csv/CSVReader.hpp +++ b/include/rosa/support/csv/CSVReader.hpp @@ -1,782 +1,829 @@ //===-- rosa/support/csv/CSVReader.hpp --------------------------*- C++ -*-===// // // The RoSA Framework // //===----------------------------------------------------------------------===// /// /// \file rosa/support/csv/CSVReader.hpp /// /// \authors David Juhasz (david.juhasz@tuwien.ac.at), Edwin Willegger (edwin.willegger@tuwien.ac.at) /// /// \date 2017-2019 /// /// \brief Facitilities to read CSV files. /// /// \note The implementation is based on the solution at /// https://stackoverflow.com/a/1120224 /// //===----------------------------------------------------------------------===// #ifndef ROSA_SUPPORT_CSV_CSVREADER_HPP #define ROSA_SUPPORT_CSV_CSVREADER_HPP #include "rosa/support/debug.hpp" #include #include #include #include namespace rosa { namespace csv { /// Indicating it the CSV file contains any header or not enum class HeaderInformation { HasHeader, HasNoHeader }; /// Anonymous namespace providing implementation details for /// \c rosa::csv::CSVIterator, consider it private. namespace { /// Provides facility for parsing values from one row CSV data. /// /// \tparam T type of values to parse from the line /// \tparam IsSignedInt if \p T is a signed integral type, always use default /// \tparam IsUnsignedInt if \p T is an unsigned integral type, always use /// default /// \tparam IsFloat if \p T is a floating-point type, always use default /// \tparam IsString if \p T is \c std::string, always use default /// /// \note Specializations of this `struct` are provided for arithmentic types /// and \c std::string. 
template <typename T,
          bool IsSignedInt =
              (std::is_integral<T>::value && std::is_signed<T>::value),
          bool IsUnsignedInt =
              (std::is_integral<T>::value && std::is_unsigned<T>::value),
          bool IsFloat = std::is_floating_point<T>::value,
          bool IsString = std::is_same<T, std::string>::value>
struct CSVRowParser;

/// Splits a line at a delimiter and stores every converted cell.
///
/// Shared implementation of the \c parse functions of all
/// \c rosa::csv::CSVRowParser specializations.
///
/// \p Data is cleared and then filled with the converted cells of
/// \p LineStream. A line ending with \p Delimeter -- and also an empty line --
/// results in one additional value-initialized entry at the end.
///
/// \tparam T type of the values stored in \p Data
/// \tparam ConverterT callable converting a \c std::string cell into \p T
///
/// \param [in,out] LineStream the line to parse
/// \param [in,out] Data the container to store the parsed values
/// \param Delimeter character separating the cells
/// \param Convert conversion applied to each cell; whatever it throws
/// propagates to the caller
template <typename T, typename ConverterT>
void parseCSVCells(std::stringstream &LineStream, std::vector<T> &Data,
                   const char Delimeter, ConverterT Convert) {
  std::string Cell;
  Data.clear();
  while (std::getline(LineStream, Cell, Delimeter)) {
    Data.push_back(Convert(Cell));
  }
  // The loop stops at the first failed read. A read that fails because the
  // stream already hit its end leaves Cell untouched (the last, non-empty
  // cell); Cell is empty only if the read started exactly at the end of the
  // input, i.e. right after a trailing delimiter or on an empty line.
  if (!LineStream && Cell.empty()) {
    Data.push_back(T());
  }
}

/// Extracts one column of a line and stores its converted cell.
///
/// Shared implementation of the \c parseValue functions of all
/// \c rosa::csv::CSVRowParser specializations.
///
/// \p Data is cleared. Cells are read until the one at index \p Column is
/// found, which is converted and stored as the only entry of \p Data. If the
/// line ends before \p Column is reached, \p Data stays empty unless the line
/// is empty or ends with \p Delimeter, in which case one value-initialized
/// entry is stored.
///
/// \tparam T type of the values stored in \p Data
/// \tparam ConverterT callable converting a \c std::string cell into \p T
///
/// \param [in,out] LineStream the line to parse
/// \param [in,out] Data the container to store the parsed value
/// \param Column zero-based index of the cell to extract
/// \param Delimeter character separating the cells
/// \param Convert conversion applied to the selected cell; whatever it throws
/// propagates to the caller
template <typename T, typename ConverterT>
void parseCSVCellAt(std::stringstream &LineStream, std::vector<T> &Data,
                    const size_t Column, const char Delimeter,
                    ConverterT Convert) {
  std::string Cell;
  size_t CurrentColumn = 0;
  Data.clear();
  while (std::getline(LineStream, Cell, Delimeter)) {
    if (CurrentColumn == Column) {
      Data.push_back(Convert(Cell));
      break;
    }
    ++CurrentColumn;
  }
  // Not reachable after the break above (the stream has not failed then);
  // handles a trailing delimiter or an empty line only.
  if (!LineStream && Cell.empty()) {
    Data.push_back(T());
  }
}

/// Specialization for signed integral types.
///
/// \tparam T type of values to parse from the line
///
/// \pre \p T is a signed integral type:\code
/// std::is_integral<T>::value && std::is_signed<T>::value
/// \endcode
template <typename T> struct CSVRowParser<T, true, false, false, false> {
  static_assert((std::is_integral<T>::value && std::is_signed<T>::value),
                "wrong type"); // Sanity check.

  /// Parses a given row of CSV data into a given container.
  ///
  /// \p Data is cleared and then filled with values parsed from \p LineStream.
  /// Entries in the line are to be separated by \p Delimeter. A trailing
  /// delimiter results in an additional `0` entry at the end of the line. No
  /// empty entry should be present otherwise.
  ///
  /// \note Parsed values are silently converted to type \p T.
  ///
  /// \param [in,out] LineStream the line to parse
  /// \param [in,out] Data the container to store the parsed values
  /// \param Delimeter character separating the entries
  static void parse(std::stringstream &LineStream, std::vector<T> &Data,
                    char Delimeter = ',') {
    parseCSVCells(LineStream, Data, Delimeter, convert);
  }

  /// Parses a given column of a given row of CSV data into a given container.
  ///
  /// \p Data is cleared and then receives the value found at \p Column.
  ///
  /// \note The parsed value is silently converted to type \p T.
  ///
  /// \param [in,out] LineStream the line to parse
  /// \param [in,out] Data the container to store the parsed value
  /// \param Column zero-based index of the entry to parse
  /// \param Delimeter character separating the entries
  static void parseValue(std::stringstream &LineStream, std::vector<T> &Data,
                         size_t Column = 0, char Delimeter = ',') {
    parseCSVCellAt(LineStream, Data, Column, Delimeter, convert);
  }

private:
  /// Converts one cell via \c std::stoll.
  static T convert(const std::string &Cell) {
    return static_cast<T>(std::stoll(Cell));
  }
};

/// Specialization for unsigned integral types.
///
/// \tparam T type of values to parse from the line
///
/// \pre \p T is an unsigned integral type:\code
/// std::is_integral<T>::value && std::is_unsigned<T>::value
/// \endcode
template <typename T> struct CSVRowParser<T, false, true, false, false> {
  static_assert((std::is_integral<T>::value && std::is_unsigned<T>::value),
                "wrong type"); // Sanity check.

  /// Parses a given row of CSV data into a given container.
  ///
  /// \p Data is cleared and then filled with values parsed from \p LineStream.
  /// Entries in the line are to be separated by \p Delimeter. A trailing
  /// delimiter results in an additional `0` entry at the end of the line. No
  /// empty entry should be present otherwise.
  ///
  /// \note Parsed values are silently converted to type \p T.
  ///
  /// \param [in,out] LineStream the line to parse
  /// \param [in,out] Data the container to store the parsed values
  /// \param Delimeter character separating the entries
  static void parse(std::stringstream &LineStream, std::vector<T> &Data,
                    char Delimeter = ',') {
    parseCSVCells(LineStream, Data, Delimeter, convert);
  }

  /// Parses a given column of a given row of CSV data into a given container.
  ///
  /// \p Data is cleared and then receives the value found at \p Column.
  ///
  /// \note The parsed value is silently converted to type \p T.
  ///
  /// \param [in,out] LineStream the line to parse
  /// \param [in,out] Data the container to store the parsed value
  /// \param Column zero-based index of the entry to parse
  /// \param Delimeter character separating the entries
  static void parseValue(std::stringstream &LineStream, std::vector<T> &Data,
                         size_t Column = 0, char Delimeter = ',') {
    parseCSVCellAt(LineStream, Data, Column, Delimeter, convert);
  }

private:
  /// Converts one cell via \c std::stoull, so that the full unsigned range is
  /// accepted by \c parse and \c parseValue alike.
  static T convert(const std::string &Cell) {
    return static_cast<T>(std::stoull(Cell));
  }
};

/// Specialization for floating-point types.
///
/// \tparam T type of values to parse from the line
///
/// \pre \p T is a floating-point type:\code
/// std::is_floating_point<T>::value
/// \endcode
template <typename T> struct CSVRowParser<T, false, false, true, false> {
  static_assert((std::is_floating_point<T>::value),
                "wrong type"); // Sanity check.

  /// Parses a given row of CSV data into a given container.
  ///
  /// \p Data is cleared and then filled with values parsed from \p LineStream.
  /// Entries in the line are to be separated by \p Delimeter. A trailing
  /// delimiter results in an additional `0` entry at the end of the line. No
  /// empty entry should be present otherwise.
  ///
  /// \note Parsed values are silently converted to type \p T.
  ///
  /// \param [in,out] LineStream the line to parse
  /// \param [in,out] Data the container to store the parsed values
  /// \param Delimeter character separating the entries
  static void parse(std::stringstream &LineStream, std::vector<T> &Data,
                    char Delimeter = ',') {
    parseCSVCells(LineStream, Data, Delimeter, convert);
  }

  /// Parses a given column of a given row of CSV data into a given container.
  ///
  /// \p Data is cleared and then receives the value found at \p Column.
  ///
  /// \note The parsed value is silently converted to type \p T.
  ///
  /// \param [in,out] LineStream the line to parse
  /// \param [in,out] Data the container to store the parsed value
  /// \param Column zero-based index of the entry to parse
  /// \param Delimeter character separating the entries
  static void parseValue(std::stringstream &LineStream, std::vector<T> &Data,
                         size_t Column = 0, char Delimeter = ',') {
    parseCSVCellAt(LineStream, Data, Column, Delimeter, convert);
  }

private:
  /// Converts one cell via \c std::stold.
  static T convert(const std::string &Cell) {
    return static_cast<T>(std::stold(Cell));
  }
};

/// Specialization for \c std::string.
///
/// \tparam T type of values to parse from the line
///
/// \pre \p T is \c std::string:\code
/// std::is_same<T, std::string>::value
/// \endcode
template <typename T> struct CSVRowParser<T, false, false, false, true> {
  static_assert((std::is_same<T, std::string>::value),
                "wrong type"); // Sanity check.

  /// Parses a given row of CSV data into a given container.
  ///
  /// \p Data is cleared and then filled with the entries of \p LineStream.
  /// Entries in the line are to be separated by \p Delimeter. A trailing
  /// delimiter results in an additional empty entry at the end of the line.
  ///
  /// \param [in,out] LineStream the line to parse
  /// \param [in,out] Data the container to store the parsed values
  /// \param Delimeter character separating the entries
  static void parse(std::stringstream &LineStream, std::vector<T> &Data,
                    char Delimeter = ',') {
    parseCSVCells(LineStream, Data, Delimeter, convert);
  }

  /// Parses a given column of a given row of CSV data into a given container.
  ///
  /// \p Data is cleared and then receives the text found at \p Column,
  /// unmodified.
  ///
  /// \param [in,out] LineStream the line to parse
  /// \param [in,out] Data the container to store the parsed value
  /// \param Column zero-based index of the entry to parse
  /// \param Delimeter character separating the entries
  static void parseValue(std::stringstream &LineStream, std::vector<T> &Data,
                         size_t Column = 0, char Delimeter = ',') {
    parseCSVCellAt(LineStream, Data, Column, Delimeter, convert);
  }

private:
  /// Strings need no conversion: the cell is stored as it is.
  static T convert(const std::string &Cell) { return Cell; }
};

/// Parses and stores entries from a row of CSV data.
///
/// \tparam T type of values to parse and store, i.e.
entries in the row /// /// \note The implementation relies on \c rosa::csv::CSVRowParser, which is /// implemented only for `arithmetic` types -- signed and unsigned integral and /// floating-point types -- and for \c std::string. Those are the valid values /// for \p T. template class CSVRow { public: CSVRow() : isHeaderRead(false), Delimeter(','), EndOfLine(','), Column(1){} /// Gives a constant reference for an entry at a given position of the row. /// /// \note No bounds checking is performed. /// /// \param Index the position of the entry /// /// \return constant reference for the stored entry at position \p Index const T &operator[](const size_t Index) const noexcept { return Data[Index]; } /// Tells the number of entries stored in the row. /// /// \return number of stored entries. size_t size(void) const noexcept { return Data.size(); } /// Parses and stores one row of CSV data. /// /// The function reads one line from \p Str and parses it into /// \c rosa::csv::CSVRow::Data using \c rosa::csv::CSVRowParser. 
/// /// \param [in,out] Str input stream of a CSV file void readNextRow(std::istream &Str) { std::string Line; std::getline(Str, Line); std::stringstream LineStream(Line); + RowNumber = RowNumber + 1; CSVRowParser::parse(LineStream, Data, Delimeter); } bool isNumeric(const std::string& input){ return std::all_of(input.begin(), input.end(), ::isdigit); } void readHeaderRow(std::istream &Str){ std::string Line; std::getline(Str, Line); std::stringstream LineStream(Line); CSVRowParser::parse(LineStream, Header, Delimeter); + RowNumber = RowNumber + 1; isHeaderRead = true; } inline bool isHeaderAlreadyRead(){ return isHeaderRead; } HeaderInformation isHeaderSet(){ return HeaderInfo; } inline void setDelimeter(char Delimeter){ this->Delimeter = Delimeter; } inline char getDelimeter(){ return this->Delimeter; } inline void setEndOfLine(char EndOfLine){ this->EndOfLine = EndOfLine; } inline char getEndOfLine(){ return this->EndOfLine; } inline bool isThisFirstRow(){ return this->isFirstRow; } inline void setColumn(const size_t & Column){ this->Column = Column; } inline void setHeaderInfo(const HeaderInformation HeaderInfo){ this->HeaderInfo = HeaderInfo; } inline void setSkipRows(const size_t &SkipRows){ this->SkipRows = SkipRows; } + inline const size_t & getSkipRows(){ + return this->SkipRows; + } + + inline uint64_t getRowNumber(){ + return this->RowNumber; + } + private: std::vector Data; ///< Stores parsed entries uint64_t RowNumber; ///< Current row number std::vector Header; ///< Stores the header entries if available char Delimeter; ///< Stores the delimeter between data entries char EndOfLine; ///< Stores the end of line character size_t Column; ///< Stores the column to get the data out of the row HeaderInformation HeaderInfo; ///< Indicates if CSV file contains a header row (expected first row to be the header). size_t SkipRows; ///< Number of Rows to skip at the beginning of the file. 
bool isHeaderRead; ///< Indicates if header was read }; /// Parses and stores entries from a row of CSV data. /// It parses an entire row and takes only the value of the corresponding column. /// /// \tparam T type of values to parse and store, i.e. entries in the row /// /// \note The implementation relies on \c rosa::csv::CSVRowParser, which is /// implemented only for `arithmetic` types -- signed and unsigned integral and /// floating-point types -- and for \c std::string. Those are the valid values /// for \p T. template class CSVValue { public: CSVValue() : isHeaderRead(false), Delimeter(','), - EndOfLine(','), Column(0) { } + EndOfLine(','), Column(0), RowNumber(0) { } /// Gives a constant reference for an entry at a given position of the row. /// /// \note No bounds checking is performed. /// /// \param Index the position of the entry /// /// \return constant reference for the stored entry at position \p Index const T &operator[](const size_t Index) const noexcept { return Data[Index]; } /// Tells the number of entries stored in the row. /// /// \return number of stored entries. size_t size(void) const noexcept { return Data.size(); } /// Parses and stores one row of CSV data. /// /// The function reads one line from \p Str and parses it into /// \c rosa::csv::CSVRow::Data using \c rosa::csv::CSVRowParser. 
/// /// \param [in,out] Str input stream of a CSV file void readNextValue(std::istream &Str) { std::string Line; std::getline(Str, Line); std::stringstream LineStream(Line); CSVRowParser::parseValue(LineStream, Data, Column, Delimeter); + + RowNumber = RowNumber + 1; } bool isNumeric(const std::string& input){ return std::all_of(input.begin(), input.end(), ::isdigit); } void readHeaderRow(std::istream &Str){ std::string Line; std::getline(Str, Line); std::stringstream LineStream(Line); CSVRowParser::parse(LineStream, Header, Delimeter); isHeaderRead = true; + + RowNumber = RowNumber + 1; } inline bool isHeaderAlreadyRead(){ return isHeaderRead; } inline HeaderInformation isHeaderSet(){ return HeaderInfo; } inline void setDelimeter(char Delimeter){ this->Delimeter = Delimeter; } inline char getDelimeter(){ return this->Delimeter; } inline void setEndOfLine(char EndOfLine){ this->EndOfLine = EndOfLine; } inline char getEndOfLine(){ return this->EndOfLine; } inline bool isThisFirstRow(){ return this->isFirstRow; } inline void setColumn(const size_t &Column){ this->Column = Column; } inline void setHeaderInfo(const HeaderInformation HeaderInfo){ this->HeaderInfo = HeaderInfo; } inline void setSkipRows(const size_t &SkipRows){ this->SkipRows = SkipRows; } + inline const size_t & getSkipRows(){ + return this->SkipRows; + } + + inline uint64_t getRowNumber(){ + return this->RowNumber; + } + private: std::vector Data; ///< Stores parsed entries uint64_t RowNumber; ///< Current row number std::vector Header; ///< Stores the header entries if available char Delimeter; ///< Stores the delimeter between data entries char EndOfLine; ///< Stores the end of line character size_t Column; ///< Stores the column to get the data out of the row HeaderInformation HeaderInfo; ///< Indicates if CSV file contains a header row (expected first row to be the header). size_t SkipRows; ///< Number of Rows to skip at the beginning of the file. 
bool isHeaderRead; ///< Indicates if header was read }; /// Reads a row of CSV data into \c rosa::csv::CSVRow. /// /// The next line is read from \p Str by calling /// \c rosa::csv::CSVRow::readNextRow on \p Data. /// /// \note A CSV file should contain no empty lines. /// /// \param [in,out] Str input stream of a CSV file /// \param [in,out] Data object to read the next line into /// /// \return \p Str after reading one line from it template std::istream &operator>>(std::istream &Str, CSVRow &Data) { + size_t SkipRowsCorrected = 0; + if (Data.isHeaderSet() == HeaderInformation::HasHeader && !Data.isHeaderAlreadyRead()){ Data.readHeaderRow(Str); } + if (Data.isHeaderSet() == HeaderInformation::HasHeader){ + + } + + SkipRowsCorrected = Data.getSkipRows(); + if (Data.isHeaderSet() == HeaderInformation::HasHeader){ + SkipRowsCorrected = SkipRowsCorrected + 1; + } + + while (Data.getRowNumber() < SkipRowsCorrected){ + Data.readNextRow(Str); + } Data.readNextRow(Str); /* // just for debugging purpose char c; while(Str.get(c)){ std::cout << c; } std::cout << std::endl; */ return Str; } /// Reads a value of CSV data into \c rosa::csv::CSVValue. /// /// The next line is read from \p Str by calling /// \c rosa::csv::CSVValue::readNextValue on \p Data. /// If the file contains a header, the first row and the second row /// is read, so after the first read always valid data is available. /// /// \note A CSV file should contain no empty lines. 
/// /// \param [in,out] Str input stream of a CSV file /// \param [in,out] Data object to read the next line into /// /// \return \p Str after reading one line from it template std::istream &operator>>(std::istream &Str, CSVValue &Data) { + size_t SkipRowsCorrected = 0; + if (Data.isHeaderSet() == HeaderInformation::HasHeader && !Data.isHeaderAlreadyRead()){ Data.readHeaderRow(Str); } + SkipRowsCorrected = Data.getSkipRows(); + if (Data.isHeaderSet() == HeaderInformation::HasHeader){ + SkipRowsCorrected = SkipRowsCorrected + 1; + } + + while (Data.getRowNumber() < SkipRowsCorrected){ + Data.readNextValue(Str); + } + Data.readNextValue(Str); /* // just for debugging purpose char c; while(Str.get(c)){ std::cout << c; } std::cout << std::endl; */ return Str; } } // End namespace /// Provides `InputIterator` features for iterating over a CSV file in a /// flat way. /// /// The iterator hides rows of the CSV file, and iterates over the entries /// row-by-row. /// /// \note A CSV file should contain no empty lines. /// /// \tparam T type of values to iterate over, i.e. entries in the CSV file. /// /// \note The implementation relies on \c rosa::csv::CSVRow, which in turn /// relies on \c rosa::csv::CSVRowParser, which is implemented only for /// `arithmetic` types -- signed and unsigned integral types and floating-point /// types -- and for \c std::string. Those are the valid values for \p T. template class CSVFlatIterator { public: /// \defgroup CSVFlatIteratorTypedefs Typedefs of rosa::csv::CSVFlatIterator /// /// Standard `typedef`s for iterators. /// ///@{ typedef std::input_iterator_tag iterator_category; ///< Category of the iterator. typedef T value_type; ///< Type of values iterated over. typedef std::size_t difference_type; ///< Type to identify distance. typedef T *pointer; ///< Pointer to the type iterated over. typedef T &reference; ///< Reference to the type iterated over. ///@} /// Creates a new instance. 
/// /// \param [in,out] S input stream to iterate over CSVFlatIterator(std::istream &S, size_t Column = 0, HeaderInformation HeaderInfo = HeaderInformation::HasHeader, bool MultipleRow = true, size_t SkipRows = 0, const char Delimeter = ',', const char EndOfLine = '\n') : Str(S.good() ? &S : nullptr), Pos((size_t)(-1)), Column(Column), HeaderInfo(HeaderInfo), MultipleRow(MultipleRow), SkipRows(SkipRows), Delimeter(Delimeter), EndOfLine(EndOfLine) { Row.setHeaderInfo(HeaderInfo); Row.setSkipRows(SkipRows); Row.setDelimeter(Delimeter); Row.setEndOfLine(EndOfLine); Row.setColumn(Column); Value.setHeaderInfo(HeaderInfo); Value.setSkipRows(SkipRows); Value.setDelimeter(Delimeter); Value.setEndOfLine(EndOfLine); Value.setColumn(Column); // \c rosa::csv::CSVFlatIterator::Pos is initialized to `-1` so the first // incrementation here will set it properly. ++(*this); } /// Creates an empty new instance. CSVFlatIterator(void) noexcept : Str(nullptr), MultipleRow(true) {} /// Pre-increment operator. /// /// The implementation moves over the entries in the current row and advances /// to the next row when the end of the current row is reached. If the end of /// the input stream is reached, the operator becomes empty and has no /// further effect. /// /// \return \p this object after incrementing it. CSVFlatIterator &operator++() { if (Str) { ++Pos; if(MultipleRow){ if (Pos == Row.size()) { if (!((*Str) >> Row)) { Str = nullptr; --Pos; // Stay on the last entry forever. } else { Pos = 0; } } }else{ if (Pos == Value.size()) { if (!((*Str) >> Value)) { Str = nullptr; --Pos; // Stay on the last entry forever. } else { Pos = 0; } } } } return *this; } /// Post-increment operator. /// /// The implementation uses the pre-increment operator and returns a copy of /// the original state of \p this object. /// /// \return \p this object before incrementing it. 
CSVFlatIterator operator++(int) { CSVFlatIterator Tmp(*this); ++(*this); return Tmp; } /// Returns a constant reference to the current entry. /// /// \note Should not dereference the iterator when it is empty. /// /// \return constanStartt reference to the current entry. const T &operator*(void)const noexcept { if(MultipleRow){ return Row[Pos]; }else { return Value[Pos]; } } /// Returns a constant pointer to the current entry. /// /// \note Should not dereference the iterator when it is empty. /// /// \return constant pointer to the current entry. const T *operator->(void)const noexcept { if(MultipleRow){ return &Row[Pos]; }else { return &Value[Pos]; } } /// Tells if \p this object is equal to another one. /// /// Two \c rosa::csv::CSVReader instances are equal if and only if they are /// the same or both are empty. /// /// \param RHS other object to compare to /// /// \return whether \p this object is equal with \p RHS bool operator==(const CSVFlatIterator &RHS) const noexcept { return ((this == &RHS) || ((this->Str == nullptr) && (RHS.Str == nullptr))); } /// Tells if \p this object is not equal to another one. /// /// \see rosa::csv::CSVReader::operator== /// /// \param RHS other object to compare to /// /// \return whether \p this object is not equal with \p RHS. bool operator!=(const CSVFlatIterator &RHS) const noexcept { return !((*this) == RHS); } inline void setDelimeter(char Delimter){ this->Delimeter = Delimter; } inline char getDelimeter(){ return this->Delimeter; } private: std::istream *Str; ///< Input stream of a CSV file to iterate over. CSVRow Row; ///< Content of the current row iterating over. CSVValue Value; ///< Content of the specified column in the current row. size_t Pos; ///< Current position within the current row. char Delimeter; ///< Delimeter between the entries char EndOfLine; ///< stores the end of line character size_t Column; ///< Index of the column to get data out of it, starts at zero. 
HeaderInformation HeaderInfo; ///< Indicates if CSV file contains a header row (expected first row to be the header). bool MultipleRow; ///< Indicates if you want to read a row or only one column out of a row. true = use CSVRow size_t SkipRows; ///< Number of Rows to skip at the beginning of the file. }; } // End namespace csv } // End namespace rosa #endif // ROSA_SUPPORT_CSV_CSVREADER_HPP