// Licensed under a 3-clause BSD style license - see LICENSE.rst #include "tokenizer.h" tokenizer_t *create_tokenizer(char delimiter, char comment, char quotechar, char expchar, int fill_extra_cols, int strip_whitespace_lines, int strip_whitespace_fields, int use_fast_converter) { // Create the tokenizer in memory tokenizer_t *tokenizer = (tokenizer_t *) malloc(sizeof(tokenizer_t)); // Initialize the tokenizer fields tokenizer->source = NULL; tokenizer->source_len = 0; tokenizer->source_pos = 0; tokenizer->delimiter = delimiter; tokenizer->comment = comment; tokenizer->quotechar = quotechar; tokenizer->expchar = expchar; tokenizer->newline = '\n'; tokenizer->output_cols = NULL; tokenizer->col_ptrs = NULL; tokenizer->output_len = NULL; tokenizer->num_cols = 0; tokenizer->num_rows = 0; tokenizer->fill_extra_cols = fill_extra_cols; tokenizer->state = START_LINE; tokenizer->code = NO_ERROR; tokenizer->iter_col = 0; tokenizer->curr_pos = NULL; tokenizer->strip_whitespace_lines = strip_whitespace_lines; tokenizer->strip_whitespace_fields = strip_whitespace_fields; tokenizer->use_fast_converter = use_fast_converter; tokenizer->comment_lines = (char *) malloc(INITIAL_COMMENT_LEN); tokenizer->comment_pos = 0; tokenizer->comment_lines_len = 0; // This is a bit of a hack -- buf holds an empty string to represent // empty field values tokenizer->buf = calloc(2, sizeof(char)); // By default both \n and \r are accepted as newline, unless one of // them has also been specified as field delimiter if (tokenizer->delimiter == '\n') tokenizer->newline = '\r'; return tokenizer; } void delete_data(tokenizer_t *tokenizer) { // Don't free tokenizer->source because it points to part of // an already freed Python object int i; if (tokenizer->output_cols) { for (i = 0; i < tokenizer->num_cols; ++i) { free(tokenizer->output_cols[i]); } } free(tokenizer->output_cols); free(tokenizer->col_ptrs); free(tokenizer->output_len); // Set pointers to 0 so we don't use freed memory when reading over again tokenizer->output_cols = 0; tokenizer->col_ptrs = 0; tokenizer->output_len = 0; } void delete_tokenizer(tokenizer_t *tokenizer) { delete_data(tokenizer); free(tokenizer->comment_lines); free(tokenizer->buf); free(tokenizer); } void resize_col(tokenizer_t *self, int index) { // Temporarily store the position in output_cols[index] to // which col_ptrs[index] points long diff = self->col_ptrs[index] - self->output_cols[index]; // Double the size of the column string self->output_cols[index] = (char *) realloc(self->output_cols[index], 2 * self->output_len[index] * sizeof(char)); // Set the second (newly allocated) half of the column string to all zeros memset(self->output_cols[index] + self->output_len[index] * sizeof(char), 0, self->output_len[index] * sizeof(char)); self->output_len[index] *= 2; // realloc() might move the address in memory, so we have to move // col_ptrs[index] to an offset of the new address self->col_ptrs[index] = self->output_cols[index] + diff; } void resize_comments(tokenizer_t *self) { // Double the size of the comments string self->comment_lines = (char *) realloc(self->comment_lines, self->comment_pos + 1); // Set the second (newly allocated) half of the column string to all zeros memset(self->comment_lines + self->comment_lines_len * sizeof(char), 0, (self->comment_pos + 1 - self->comment_lines_len) * sizeof(char)); self->comment_lines_len = self->comment_pos + 1; } /* Resize the column string if necessary and then append c to the end of the column string, incrementing the column position pointer. */ static inline void push(tokenizer_t *self, char c, int col) { if (self->col_ptrs[col] - self->output_cols[col] >= self->output_len[col]) { resize_col(self, col); } *self->col_ptrs[col]++ = c; } /* Resize the comment string if necessary and then append c to the end of the comment string. */ static inline void push_comment(tokenizer_t *self, char c) { if (self->comment_pos >= self->comment_lines_len) { resize_comments(self); } self->comment_lines[self->comment_pos++] = c; } static inline void end_comment(tokenizer_t *self) { // Signal empty comment by inserting \x01 if (self->comment_pos == 0 || self->comment_lines[self->comment_pos - 1] == '\x00') { push_comment(self, '\x01'); } push_comment(self, '\x00'); } #define PUSH(c) push(self, c, col) /* Set the state to START_FIELD and begin with the assumption that the field is entirely whitespace in order to handle the possibility that the comment character is found before any non-whitespace even if whitespace stripping is disabled. */ #define BEGIN_FIELD() \ self->state = START_FIELD; \ whitespace = 1 /* First, backtrack to eliminate trailing whitespace if strip_whitespace_fields is true. If the field is empty, push '\x01' as a marker. Append a null byte to the end of the column string as a field delimiting marker. Increment the variable col if we are tokenizing data. */ static inline void end_field(tokenizer_t *self, int *col, int header) { if (self->strip_whitespace_fields && self->col_ptrs[*col] != self->output_cols[*col]) { --self->col_ptrs[*col]; while (*self->col_ptrs[*col] == ' ' || *self->col_ptrs[*col] == '\t') { *self->col_ptrs[*col]-- = '\x00'; } ++self->col_ptrs[*col]; } if (self->col_ptrs[*col] == self->output_cols[*col] || self->col_ptrs[*col][-1] == '\x00') { push(self, '\x01', *col); } push(self, '\x00', *col); if (!header) { ++*col; } } #define END_FIELD() end_field(self, &col, header) // Set the error code to c for later retrieval and return c #define RETURN(c) \ do { \ self->code = c; \ return c; \ } while (0) /* If we are tokenizing the header, end after the first line. Handle the possibility of insufficient columns appropriately; if fill_extra_cols=1, then append empty fields, but otherwise return an error. Increment our row count and possibly end if all the necessary rows have already been parsed. */ static inline int end_line(tokenizer_t *self, int col, int header, int end, tokenizer_state *old_state) { if (header) { ++self->source_pos; RETURN(NO_ERROR); } else if (self->fill_extra_cols) { while (col < self->num_cols) { PUSH('\x01'); END_FIELD(); } } else if (col < self->num_cols) { RETURN(NOT_ENOUGH_COLS); } ++self->num_rows; *old_state = START_LINE; if (end != -1 && self->num_rows == end) { ++self->source_pos; RETURN(NO_ERROR); } return -1; } #define END_LINE() if (end_line(self, col, header, end, &old_state) != -1) return self->code int skip_lines(tokenizer_t *self, int offset, int header) { int signif_chars = 0; int comment = 0; int i = 0; char c; while (i < offset) { if (self->source_pos >= self->source_len) { if (header) RETURN(INVALID_LINE); // header line is required else RETURN(NO_ERROR); // no data in input } c = self->source[self->source_pos]; if ((c == '\r' || c == '\n') && c != self->delimiter) { if (c == '\r' && self->source_pos < self->source_len - 1 && self->source[self->source_pos + 1] == '\n') { ++self->source_pos; // skip \n in \r\n } if (!comment && signif_chars > 0) ++i; else if (comment && !header) end_comment(self); // Start by assuming a line is empty and non-commented signif_chars = 0; comment = 0; } else if ((c != ' ' && c != '\t') || !self->strip_whitespace_lines) { // Comment line if (!signif_chars && self->comment != 0 && c == self->comment) comment = 1; else if (comment && !header) push_comment(self, c); // Significant character encountered ++signif_chars; } else if (comment && !header) { push_comment(self, c); } ++self->source_pos; } RETURN(NO_ERROR); } int tokenize(tokenizer_t *self, int end, int header, int num_cols) { char c; // Input character int col = 0; // Current column ignoring possibly excluded columns tokenizer_state old_state = START_LINE; // Last state the tokenizer was in before CR mode int i = 0; int whitespace = 1; delete_data(self); // Clear old reading data self->num_rows = 0; self->comment_lines_len = INITIAL_COMMENT_LEN; if (header) self->num_cols = 1; // Store header output in one column else self->num_cols = num_cols; // Allocate memory for structures used during tokenization self->output_cols = (char **) malloc(self->num_cols * sizeof(char *)); self->col_ptrs = (char **) malloc(self->num_cols * sizeof(char *)); self->output_len = (size_t *) malloc(self->num_cols * sizeof(size_t)); for (i = 0; i < self->num_cols; ++i) { self->output_cols[i] = (char *) calloc(1, INITIAL_COL_SIZE * sizeof(char)); // Make each col_ptrs pointer point to the beginning of the // column string self->col_ptrs[i] = self->output_cols[i]; self->output_len[i] = INITIAL_COL_SIZE; } if (end == 0) RETURN(NO_ERROR); // Don't read if end == 0 self->state = START_LINE; // Loop until all of self->source has been read while (self->source_pos < self->source_len + 1) { if (self->source_pos == self->source_len) c = self->newline; else c = self->source[self->source_pos]; if (c == '\r' && c != self->delimiter && c != self->newline) c = '\n'; switch (self->state) { case START_LINE: if (c == self->newline) break; else if ((c == ' ' || c == '\t') && self->strip_whitespace_lines) break; else if (self->comment != 0 && c == self->comment) { // Comment line; ignore self->state = COMMENT; break; } // Initialize variables for the beginning of line parsing col = 0; BEGIN_FIELD(); // Parse in mode START_FIELD case START_FIELD: // Strip whitespace before field begins if ((c == ' ' || c == '\t') && self->strip_whitespace_fields) break; else if (!self->strip_whitespace_lines && self->comment != 0 && c == self->comment) { // Comment line, not caught earlier because of no stripping self->state = COMMENT; break; } // Handle newline characters first else if (c == self->newline) { if (self->strip_whitespace_lines) { // Move on if the delimiter is whitespace, e.g. // '1 2 3 '->['1','2','3'] if (self->delimiter == ' ' || self->delimiter == '\t') ; // Register an empty field if non-whitespace delimiter, // e.g. '1,2, '->['1','2',''] else { if (col >= self->num_cols) RETURN(TOO_MANY_COLS); END_FIELD(); } } else if (!self->strip_whitespace_lines) { // In this case we don't want to left-strip the field, // so we backtrack size_t tmp = self->source_pos; --self->source_pos; while (self->source_pos >= 0 && self->source[self->source_pos] != self->delimiter && self->source[self->source_pos] != '\n' && self->source[self->source_pos] != '\r') { --self->source_pos; } // Backtracked to line beginning if (self->source_pos == -1 || self->source[self->source_pos] == '\n' || self->source[self->source_pos] == '\r') { self->source_pos = tmp; } else { ++self->source_pos; if (self->source_pos == tmp) // No whitespace, just an empty field ; else while (self->source_pos < tmp) { // Append whitespace characters PUSH(self->source[self->source_pos]); ++self->source_pos; } if (col >= self->num_cols) RETURN(TOO_MANY_COLS); END_FIELD(); // Whitespace counts as a field } } END_LINE(); self->state = START_LINE; break; } // Before proceeding with a new field check column does not exceed // number defined in header or from auto-detect to avoid segfaults // such as https://github.com/astropy/astropy/issues/9922 else if (col >= self->num_cols) RETURN(TOO_MANY_COLS); else if (c == self->delimiter) // Field ends before it begins { END_FIELD(); BEGIN_FIELD(); break; } else if (c == self->quotechar) // Start parsing quoted field { self->state = START_QUOTED_FIELD; break; } else // Valid field character, parse again in FIELD mode self->state = FIELD; case FIELD: if (self->comment != 0 && c == self->comment && whitespace && col == 0) // No whitespace stripping, but the comment char is found // before any data, e.g. ' # a b c' self->state = COMMENT; else if (c == self->delimiter && self->source_pos < self->source_len) { // End of field, look for new field END_FIELD(); BEGIN_FIELD(); } else if (c == self->newline) { // Line ending, stop parsing both field and line END_FIELD(); END_LINE(); self->state = START_LINE; } else { if (c != ' ' && c != '\t') whitespace = 0; // Field is not all whitespace PUSH(c); } break; case START_QUOTED_FIELD: if ((c == ' ' || c == '\t') && self->strip_whitespace_fields) { // Ignore initial whitespace break; } else if (c == self->quotechar) { // Lookahead check for double quote inside quoted field, // e.g. """cd" => "cd if (self->source_pos < self->source_len - 1) { if (self->source[self->source_pos + 1] == self->quotechar) { self->state = QUOTED_FIELD_DOUBLE_QUOTE; PUSH(c); break; } } // Parse rest of field normally, e.g. ""c self->state = FIELD; } else { // Valid field character, parse again in QUOTED_FIELD mode self->state = QUOTED_FIELD; } case QUOTED_FIELD: if (c == self->quotechar) { // Lookahead check for double quote inside quoted field, // e.g. "ab""cd" => ab"cd if (self->source_pos < self->source_len - 1) { if (self->source[self->source_pos + 1] == self->quotechar) { self->state = QUOTED_FIELD_DOUBLE_QUOTE; PUSH(c); break; } } // Parse rest of field normally, e.g. "ab"c self->state = FIELD; } else { PUSH(c); } break; case QUOTED_FIELD_DOUBLE_QUOTE: // Ignore the second double quote from "ab""cd" and parse rest of // field normally as quoted field. self->state = QUOTED_FIELD; break; case COMMENT: if (c == self->newline) { self->state = START_LINE; if (!header) end_comment(self); } else if (!header) push_comment(self, c); break; // Keep looping until we find a newline } ++self->source_pos; } RETURN(0); } static int ascii_strncasecmp(const char *str1, const char *str2, size_t n) { int char1, char2; do { char1 = tolower(*(str1++)); char2 = tolower(*(str2++)); n--; } while (n && char1 != '\0' && char1 == char2); return (char1 - char2); } static inline int64_t strtoi64(const char *nptr, char **endptr, int base) { // Adapted from: https://stackoverflow.com/a/66046867 errno = 0; long long v = strtoll(nptr, endptr, base); #if LLONG_MIN < INT64_MIN || LLONG_MAX > INT64_MAX if (v < INT64_MIN) { v = INT64_MIN; errno = ERANGE; } else if (v > INT64_MAX) { v = INT64_MAX; errno = ERANGE; } #endif return (int64_t) v; } int64_t str_to_int64_t(tokenizer_t *self, char *str) { char *tmp; int64_t ret; errno = 0; ret = strtoi64(str, &tmp, 10); if (tmp == str || *tmp != '\0') self->code = CONVERSION_ERROR; else if (errno == ERANGE) self->code = OVERFLOW_ERROR; return ret; } double str_to_double(tokenizer_t *self, char *str) { char *tmp; double val; errno = 0; if (self->use_fast_converter) { val = xstrtod(str, &tmp, '.', self->expchar, ',', 1); if (errno == EINVAL || tmp == str || *tmp != '\0') { goto conversion_error; } else if (errno == ERANGE) { self->code = OVERFLOW_ERROR; } else if (errno == EDOM) // xstrtod signalling invalid exponents { self->code = CONVERSION_ERROR; } return val; } else { val = strtod(str, &tmp); if (errno == EINVAL || tmp == str || *tmp != '\0') { goto conversion_error; } else if (errno == ERANGE) { self->code = OVERFLOW_ERROR; } else if (errno == EDOM) { self->code = CONVERSION_ERROR; } return val; } conversion_error: // Handle inf and nan values for xstrtod and platforms whose strtod // doesn't support this val = 1.0; tmp = str; if (*tmp == '+') { tmp++; } else if (*tmp == '-') { tmp++; val = -1.0; } if (0 == ascii_strncasecmp(tmp, "nan", 3)) { // Handle optional nan type specifier; this is ignored tmp += 3; val = NAN; } else if (0 == ascii_strncasecmp(tmp, "inf", 3)) { tmp += 3; if (0 == ascii_strncasecmp(tmp, "inity", 5)) { tmp += 5; } val *= INFINITY; } else { // Original (tmp == str || *tmp != '\0') case, no NaN or inf found self->code = CONVERSION_ERROR; val = 0; } return val; } // --------------------------------------------------------------------------- // Implementation of xstrtod // // strtod.c // // Convert string to double // // Copyright (C) 2002 Michael Ringgaard. All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions // are met: // // 1. Redistributions of source code must retain the above copyright // notice, this list of conditions and the following disclaimer. // 2. Redistributions in binary form must reproduce the above copyright // notice, this list of conditions and the following disclaimer in the // documentation and/or other materials provided with the distribution. // 3. Neither the name of the project nor the names of its contributors // may be used to endorse or promote products derived from this software // without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND // ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS // OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) // HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF // SUCH DAMAGE. // // ----------------------------------------------------------------------- // Modifications by Warren Weckesser, March 2011: // * Rename strtod() to xstrtod(). // * Added decimal and sci arguments. // * Skip trailing spaces. // * Commented out the other functions. // Modifications by Richard T Guy, August 2013: // * Add tsep argument for thousands separator // Modifications by Michael Mueller, August 2014: // * Cache powers of 10 in memory to avoid rounding errors // * Stop parsing decimals after 17 significant figures // Modifications by Derek Homeier, August 2015: // * Recognise alternative exponent characters passed in 'sci'; try automatic // detection of allowed Fortran formats with sci='A' // * Require exactly 3 digits in exponent for Fortran-type format '8.7654+321' // Modifications by Derek Homeier, September-December 2016: // * Fixed some corner cases of very large or small exponents; proper return // * do not increment num_digits until nonzero digit read in // double xstrtod(const char *str, char **endptr, char decimal, char expchar, char tsep, int skip_trailing) { double number; int exponent; int negative; char *p = (char *) str; char exp; char sci; int num_digits; int num_decimals; int max_digits = 17; int num_exp = 3; int non_zero; int n; // Cache powers of 10 in memory static double e[] = {1., 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15, 1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 1e23, 1e24, 1e25, 1e26, 1e27, 1e28, 1e29, 1e30, 1e31, 1e32, 1e33, 1e34, 1e35, 1e36, 1e37, 1e38, 1e39, 1e40, 1e41, 1e42, 1e43, 1e44, 1e45, 1e46, 1e47, 1e48, 1e49, 1e50, 1e51, 1e52, 1e53, 1e54, 1e55, 1e56, 1e57, 1e58, 1e59, 1e60, 1e61, 1e62, 1e63, 1e64, 1e65, 1e66, 1e67, 1e68, 1e69, 1e70, 1e71, 1e72, 1e73, 1e74, 1e75, 1e76, 1e77, 1e78, 1e79, 1e80, 1e81, 1e82, 1e83, 1e84, 1e85, 1e86, 1e87, 1e88, 1e89, 1e90, 1e91, 1e92, 1e93, 1e94, 1e95, 1e96, 1e97, 1e98, 1e99, 1e100, 1e101, 1e102, 1e103, 1e104, 1e105, 1e106, 1e107, 1e108, 1e109, 1e110, 1e111, 1e112, 1e113, 1e114, 1e115, 1e116, 1e117, 1e118, 1e119, 1e120, 1e121, 1e122, 1e123, 1e124, 1e125, 1e126, 1e127, 1e128, 1e129, 1e130, 1e131, 1e132, 1e133, 1e134, 1e135, 1e136, 1e137, 1e138, 1e139, 1e140, 1e141, 1e142, 1e143, 1e144, 1e145, 1e146, 1e147, 1e148, 1e149, 1e150, 1e151, 1e152, 1e153, 1e154, 1e155, 1e156, 1e157, 1e158, 1e159, 1e160, 1e161, 1e162, 1e163, 1e164, 1e165, 1e166, 1e167, 1e168, 1e169, 1e170, 1e171, 1e172, 1e173, 1e174, 1e175, 1e176, 1e177, 1e178, 1e179, 1e180, 1e181, 1e182, 1e183, 1e184, 1e185, 1e186, 1e187, 1e188, 1e189, 1e190, 1e191, 1e192, 1e193, 1e194, 1e195, 1e196, 1e197, 1e198, 1e199, 1e200, 1e201, 1e202, 1e203, 1e204, 1e205, 1e206, 1e207, 1e208, 1e209, 1e210, 1e211, 1e212, 1e213, 1e214, 1e215, 1e216, 1e217, 1e218, 1e219, 1e220, 1e221, 1e222, 1e223, 1e224, 1e225, 1e226, 1e227, 1e228, 1e229, 1e230, 1e231, 1e232, 1e233, 1e234, 1e235, 1e236, 1e237, 1e238, 1e239, 1e240, 1e241, 1e242, 1e243, 1e244, 1e245, 1e246, 1e247, 1e248, 1e249, 1e250, 1e251, 1e252, 1e253, 1e254, 1e255, 1e256, 1e257, 1e258, 1e259, 1e260, 1e261, 1e262, 1e263, 1e264, 1e265, 1e266, 1e267, 1e268, 1e269, 1e270, 1e271, 1e272, 1e273, 1e274, 1e275, 1e276, 1e277, 1e278, 1e279, 1e280, 1e281, 1e282, 1e283, 1e284, 1e285, 1e286, 1e287, 1e288, 1e289, 1e290, 1e291, 1e292, 1e293, 1e294, 1e295, 1e296, 1e297, 1e298, 1e299, 1e300, 1e301, 1e302, 1e303, 1e304, 1e305, 1e306, 1e307, 1e308}; // Cache additional negative powers of 10 /* static double m[] = {1e-309, 1e-310, 1e-311, 1e-312, 1e-313, 1e-314, 1e-315, 1e-316, 1e-317, 1e-318, 1e-319, 1e-320, 1e-321, 1e-322, 1e-323}; */ errno = 0; // Skip leading whitespace while (isspace(*p)) p++; // Handle optional sign negative = 0; switch (*p) { case '-': negative = 1; // Fall through to increment position case '+': p++; } // No numerical value following sign - make no conversion and return zero, // resetting endptr to beginning of str (consistent with strtod behaviour) // E.g. -1.e0 and -.0e1 are valid, -.e0 is not! if (!(isdigit(*p) || (*p == decimal && isdigit(*(p + 1))))) { if (endptr) *endptr = (char *) str; return 0e0; } number = 0.; exponent = 0; num_digits = 0; num_decimals = 0; non_zero = 0; // Process string of digits while (isdigit(*p)) { if (num_digits < max_digits) { number = number * 10. + (*p - '0'); non_zero += (*p != '0'); if(non_zero) num_digits++; } else ++exponent; p++; p += (tsep != '\0' && *p == tsep); } // Process decimal part if (*p == decimal) { p++; while (num_digits < max_digits && isdigit(*p)) { number = number * 10. + (*p - '0'); non_zero += (*p != '0'); if(non_zero) num_digits++; num_decimals++; p++; } if (num_digits >= max_digits) // consume extra decimal digits while (isdigit(*p)) ++p; exponent -= num_decimals; } // Exactly 0 - no precision loss/OverflowError if (num_digits == 0) number = 0.0; // Correct for sign if (negative) number = -number; // Process an exponent string sci = toupper(expchar); if (sci == 'A') { // check for possible Fortran exponential notations, including // triple-digits with no character exp = toupper(*p); if (exp == 'E' || exp == 'D' || exp == 'Q' || *p == '+' || *p == '-') { // Handle optional sign negative = 0; switch (exp) { case '-': negative = 1; // Fall through to increment pos case '+': p++; break; case 'E': case 'D': case 'Q': switch (*++p) { case '-': negative = 1; // Fall through to increment pos case '+': p++; } } // Process string of digits n = 0; while (isdigit(*p)) { n = n * 10 + (*p - '0'); num_exp--; p++; } // Trigger error if not exactly three digits if (num_exp != 0 && (exp == '+' || exp == '-')) { errno = EDOM; number = 0.0; } if (negative) exponent -= n; else exponent += n; } } else if (toupper(*p) == sci) { // Handle optional sign negative = 0; switch (*++p) { case '-': negative = 1; // Fall through to increment pos case '+': p++; } // Process string of digits n = 0; while (isdigit(*p)) { n = n * 10 + (*p - '0'); p++; } if (negative) exponent -= n; else exponent += n; } // largest representable float64 is 1.7977e+308, closest to 0 ~4.94e-324, // but multiplying exponents in in two steps gives slightly better precision if (number != 0.0) { if (exponent > 305) { if (exponent > 308) // leading zeros already subtracted from exp number *= HUGE_VAL; else { number *= e[exponent-300]; number *= 1.e300; } } else if (exponent < -308) // subnormal { if (exponent < -616) // prevent invalid array access number = 0.; else { number /= e[-308-exponent]; number *= 1.e-308; } // trigger warning if resolution is > ~1.e-15; // strtod does so for |number| <~ 2.25e-308 // if (number > -4.94e-309 && number < 4.94e-309) errno = ERANGE; } else if (exponent > 0) number *= e[exponent]; else if (exponent < 0) number /= e[-exponent]; if (number >= HUGE_VAL || number <= -HUGE_VAL) errno = ERANGE; } if (skip_trailing) { // Skip trailing whitespace while (isspace(*p)) p++; } if (endptr) *endptr = p; return number; } void start_iteration(tokenizer_t *self, int col) { // Begin looping over the column string with index col self->iter_col = col; // Start at the initial pointer position self->curr_pos = self->output_cols[col]; } char *next_field(tokenizer_t *self, int *size) { char *tmp = self->curr_pos; // pass through the entire field until reaching the delimiter while (*self->curr_pos != '\x00') ++self->curr_pos; ++self->curr_pos; // next field begins after the delimiter if (*tmp == '\x01') // empty field; this is a hack { if (size) *size = 0; return self->buf; } else { if (size) *size = self->curr_pos - tmp - 1; return tmp; } } char *get_line(char *ptr, size_t *len, size_t map_len) { size_t pos = 0; while (pos < map_len) { if (ptr[pos] == '\r') { *len = pos; // Windows line break (\r\n) if (pos != map_len - 1 && ptr[pos + 1] == '\n') return ptr + pos + 2; // skip newline character else // Carriage return line break return ptr + pos + 1; } else if (ptr[pos] == '\n') { *len = pos; return ptr + pos + 1; } ++pos; } // done with input return 0; } void reset_comments(tokenizer_t *self) { free(self->comment_lines); self->comment_pos = 0; self->comment_lines_len = INITIAL_COMMENT_LEN; self->comment_lines = (char *) malloc(INITIAL_COMMENT_LEN); }