input/csv: add support for timestamp columns, auto detect samplerate

[libsigrok.git] / src / input / csv.c
diff --git a/src/input/csv.c b/src/input/csv.c

index b7d87717ae4c19188de8eab4a948df8137c7682a..10ddd6865a98f31dbdb2b48079420f7be41a9747 100644 (file)
--- a/src/input/csv.c
+++ b/src/input/csv.c
@@ -41,9 +41,14 @@
   *     "all remaining columns", only applicable to the last field), a format
   *     specifying character ('x' hexadecimal, 'o' octal, 'b' binary, 'l'
   *     single-bit logic), and an optional bit count (translating to: logic
- *     channels communicated in that column). This "column_formats" option
- *     is most versatile, other forms of specifying the column layout only
- *     exist for backwards compatibility.
+ *     channels communicated in that column). The 'a' format marks analog
+ *     data, an optionally following number is the digits count (resolution).
+ *     The 't' format marks timestamp values, which could help in automatic
+ *     determination of the input stream's samplerate. This "column_formats"
+ *     option is most versatile, other forms of specifying the column layout
+ *     only exist for backwards compatibility, and are rather limited. They
+ *     exclusively support logic input data in strictly adjacent columns,
+ *     with further constraints on column layout for multi-bit data.
   *
   * single_column: Specifies the column number which contains the logic data
   *     for single-column mode. All logic data is taken from several bits
@@ -109,15 +114,36 @@
   * - ... -I csv:start_line=20:header=yes:...
   *   Skip the first 19 text lines. Use line 20 to derive channel names.
   *   Data starts at line 21.
+ * - ... -I csv:column_formats=*a6 ...
+ *   Each column contains an analog value with six significant digits
+ *   after the decimal period.
+ * - ... -I csv:column_formats=t,2a ...
+ *   The first column contains timestamps, the next two columns contain
+ *   analog values. The capture's samplerate could get determined from
+ *   the timestamp values if not provided by the user by means of the
+ *   'samplerate' option. This assumes a mere number in units of seconds,
+ *   and equidistant rows. There is no fancy support for textual unit
+ *   suffixes nor gaps in the stream of samples nor other non-linearity
+ *   (just use '-' to ignore the column if the format is not supported).
   */
  
  /*
   * TODO
   *
- * - Extend support for analog input data? (optional)
+ * - Extend support for analog input data.
   *   - Determine why analog samples of 'double' data type get scrambled
   *     in sigrok-cli screen output. Is analog.encoding->unitsize not
   *     handled properly? A sigrok-cli or libsigrok (src/output) issue?
+ *   - Reconsider the channel creation after format processing. Current
+ *     logic may "bleed" channel names into the analog group when logic
+ *     channels' columns follow analog columns (seen with "-,2a,x8").
+ *     Trying to sort it out, a naive change used to map logic channels'
+ *     data to incorrect bitmap positions. The whole channel numbering
+ *     needs reconsideration. Probably it's easiest to first create _all_
+ *     logic channels so that they have adjacent numbers starting at 0
+ *     (addressing logic bits), then all analog channels (again adjacent)
+ *     to simplify the calculation of their index in the sample set as
+ *     well as their sdi channel index from the "analog column index".
   * - Optionally get sample rate from timestamp column. Just best-effort
   *   approach, not necessarily reliable. Users can always specify rates.
   * - Add a test suite for input modules in general, and CSV in specific?
@@ -136,6 +162,7 @@ enum single_col_format {
         FORMAT_HEX,     /* Hex digits for a set of bits. */
         FORMAT_OCT,     /* Oct digits for a set of bits. */
         FORMAT_ANALOG,  /* Floating point number for an analog channel. */
+       FORMAT_TIME,    /* Timestamps. */
  };
  
  static const char *col_format_text[] = {
@@ -144,6 +171,7 @@ static const char *col_format_text[] = {
         [FORMAT_HEX] = "hexadecimal",
         [FORMAT_OCT] = "octal",
         [FORMAT_ANALOG] = "analog",
+       [FORMAT_TIME] = "timestamp",
  };
  
  static const char col_format_char[] = {
@@ -152,21 +180,44 @@ static const char col_format_char[] = {
         [FORMAT_HEX] = 'x',
         [FORMAT_OCT] = 'o',
         [FORMAT_ANALOG] = 'a',
+       [FORMAT_TIME] = 't',
  };
  
+static gboolean format_is_ignore(enum single_col_format fmt)
+{
+       return (fmt == FORMAT_NONE) ? TRUE : FALSE; /* Column gets ignored. */
+}
+
+static gboolean format_is_logic(enum single_col_format fmt)
+{
+       return !(fmt < FORMAT_BIN || fmt > FORMAT_OCT); /* Inside BIN..OCT. */
+}
+
+static gboolean format_is_analog(enum single_col_format fmt)
+{
+       return (fmt == FORMAT_ANALOG) ? TRUE : FALSE; /* Analog samples. */
+}
+
+static gboolean format_is_timestamp(enum single_col_format fmt)
+{
+       return (fmt == FORMAT_TIME) ? TRUE : FALSE; /* Timestamp column. */
+}
+
  struct column_details { /* Per-column processing details, filled in by the format spec parser. */
         size_t col_nr; /* Column number, counting starts at 1. */
         enum single_col_format text_format; /* How the column's text gets interpreted. */
         size_t channel_offset; /* First channel's offset among the logic or the analog channels. */
         size_t channel_count; /* Channels fed from this column: bit count for logic, 1 for analog. */
+       size_t channel_index; /* Analog only: sdi channel list length before creation, for later lookup. */
         int analog_digits; /* Analog only: digits count taken from the format spec. */
  };
  
  struct context {
         gboolean started;
  
-       /* Current selected samplerate. */
+       /* Current samplerate, optionally determined from input data. */
         uint64_t samplerate;
+       double prev_timestamp;
         gboolean samplerate_sent;
  
         /* Number of channels. */
@@ -466,6 +517,9 @@ static int split_column_format(const char *spec,
         case 'a':
                 format_code = FORMAT_ANALOG;
                 break;
+       case 't':
+               format_code = FORMAT_TIME;
+               break;
         default:        /* includes NUL */
                 return SR_ERR_ARG;
         }
@@ -478,8 +532,8 @@ static int split_column_format(const char *spec,
         if (!endp)
                 return SR_ERR_ARG;
         if (endp == spec)
-               count = (format_code == FORMAT_ANALOG) ? 3 : 1;
-       if (!format_code)
+               count = format_is_analog(format_code) ? 3 : 1;
+       if (format_is_ignore(format_code))
                 count = 0;
         if (format_char == 'l')
                 count = 1;
@@ -548,9 +602,9 @@ static int make_column_details_from_format(const struct sr_input *in,
                         c = auto_column_count;
                 }
                 column_count += c;
-               if (f == FORMAT_ANALOG)
+               if (format_is_analog(f))
                         analog_count += c;
-               else if (f)
+               else if (format_is_logic(f))
                         logic_count += c * b;
         }
         sr_dbg("Column format %s -> %zu columns, %zu logic, %zu analog channels.",
@@ -578,23 +632,27 @@ static int make_column_details_from_format(const struct sr_input *in,
                         detail = &inc->column_details[column_idx++];
                         detail->col_nr = column_idx;
                         detail->text_format = f;
-                       if (detail->text_format == FORMAT_ANALOG) {
+                       if (format_is_analog(detail->text_format)) {
                                 detail->channel_offset = analog_idx;
                                 detail->channel_count = 1;
                                 detail->analog_digits = b;
                                 analog_idx += detail->channel_count;
-                       } else if (detail->text_format) {
+                       } else if (format_is_logic(detail->text_format)) {
                                 detail->channel_offset = channel_idx;
                                 detail->channel_count = b;
                                 channel_idx += detail->channel_count;
-                       }
-                       sr_dbg("detail -> col %zu, fmt %s, ch off/cnt %zu/%zu",
-                               detail->col_nr, col_format_text[detail->text_format],
-                               detail->channel_offset, detail->channel_count);
-                       if (!detail->text_format)
+                       } else if (format_is_ignore(detail->text_format)) {
+                               /* EMPTY */
                                 continue;
+                       } else {
+                               /*
+                                * Neither logic nor analog data, nor ignore.
+                                * Format was noted. No channel creation involved.
+                                */
+                               continue;
+                       }
                         /*
-                        * Create channels with appropriate names. Optionally
+                        * Pick most appropriate channel names. Optionally
                          * use text from a header line (when requested by the
                          * user). In the absence of header text, channels are
                          * assigned rather generic names.
@@ -610,6 +668,13 @@ static int make_column_details_from_format(const struct sr_input *in,
                                 caption = NULL;
                         if (!caption || !*caption)
                                 caption = NULL;
+                       /*
+                        * TODO Need we first create _all_ logic channels,
+                        * before creating analog channels? Just store the
+                        * parameters here (index, type, name) and have the
+                        * creation sequence done outside of the format
+                        * spec parse loop.
+                        */
                         for (create_idx = 0; create_idx < detail->channel_count; create_idx++) {
                                 if (caption && detail->channel_count == 1) {
                                         g_string_assign(channel_name, caption);
@@ -620,12 +685,15 @@ static int make_column_details_from_format(const struct sr_input *in,
                                         g_string_printf(channel_name, "%zu",
                                                 detail->channel_offset + create_idx);
                                 }
-                               if (detail->text_format == FORMAT_ANALOG) {
+                               if (format_is_analog(detail->text_format)) {
                                         channel_sdi_nr = logic_count + detail->channel_offset + create_idx;
                                         channel_type = SR_CHANNEL_ANALOG;
-                               } else {
+                                       detail->channel_index = g_slist_length(in->sdi->channels);
+                               } else if (format_is_logic(detail->text_format)) {
                                         channel_sdi_nr = detail->channel_offset + create_idx;
                                         channel_type = SR_CHANNEL_LOGIC;
+                               } else {
+                                       continue;
                                 }
                                 sr_channel_new(in->sdi, channel_sdi_nr,
                                         channel_type, TRUE, channel_name->str);
@@ -776,8 +844,7 @@ static int parse_logic(const char *column, struct context *inc,
                         ch_rem--;
                         set_logic_level(inc, ch_idx + 0, bits & (1 << 0));
                         break;
-               case FORMAT_ANALOG:
-               case FORMAT_NONE:
+               default:
                         /* ShouldNotHappen(TM), but silences compiler warning. */
                         return SR_ERR;
                 }
@@ -815,7 +882,7 @@ static int parse_analog(const char *column, struct context *inc,
         csv_analog_t value;
         int ret;
  
-       if (details->text_format != FORMAT_ANALOG)
+       if (!format_is_analog(details->text_format))
                 return SR_ERR_BUG;
  
         length = strlen(column);
@@ -843,6 +910,95 @@ static int parse_analog(const char *column, struct context *inc,
         return SR_OK;
  }
  
+/**
+ * @brief Parse a timestamp text, auto-determine samplerate.
+ *
+ * @param[in] column   The input text, a floating point number.
+ * @param[in] inc      The input module's context.
+ * @param[in] details  The column processing details.
+ *
+ * @retval SR_OK       Success, or unusable text which got ignored.
+ * @retval SR_ERR_BUG  The column's format is not "timestamp".
+ *
+ * This routine attempts to automatically determine the input data's
+ * samplerate from text rows' timestamp values. Only simple formats are
+ * supported, user provided values always take precedence.
+ */
+static int parse_timestamp(const char *column, struct context *inc,
+       const struct column_details *details)
+{
+       double ts, rate;
+       int ret;
+
+       if (!format_is_timestamp(details->text_format))
+               return SR_ERR_BUG;
+
+       /*
+        * Implementor's notes on timestamp interpretation. Use a simple
+        * approach for improved maintainability which covers most cases
+        * of input data. There is not much gain in adding complexity,
+        * users can easily provide the rate when auto-detection fails.
+        * - Bail out if samplerate is known already.
+        * - Try to interpret the timestamp (simple float conversion).
+        *   Upon failure, and for values of zero which could be silent
+        *   fails, warn, clear previous knowledge, and bail out (non-fatal).
+        *   Useful data has at least two adjacent rows with valid values.
+        * - If there is no previous timestamp, keep the current value
+        *   for later reference and bail out.
+        * - If a previous timestamp was seen, clear that knowledge and
+        *   determine the difference. Warn and bail out (non-fatal) when
+        *   the difference is zero, negative, or not a number, or when
+        *   the rate exceeds the uint64_t range. Converting such values
+        *   to an integer type would be undefined behaviour.
+        * - Otherwise round the rate and update internal state. Subsequent
+        *   calls will ignore following input data (rate is known).
+        *
+        * TODO Potential future improvements:
+        * - Prefer rationals over floats for improved precision and
+        *   reduced rounding errors which result in odd rates.
+        * - Support other formats ("2 ms" or similar)?
+        */
+       if (inc->samplerate)
+               return SR_OK;
+       ret = sr_atod_ascii(column, &ts);
+       if (ret != SR_OK)
+               ts = 0.0;
+       if (!ts) {
+               sr_warn("Cannot convert timestamp text %s in line %zu (or zero value).",
+                       column, inc->line_number);
+               inc->prev_timestamp = 0.0;
+               return SR_OK;
+       }
+       if (!inc->prev_timestamp) {
+               sr_dbg("First timestamp value %g in line %zu.",
+                       ts, inc->line_number);
+               inc->prev_timestamp = ts;
+               return SR_OK;
+       }
+       sr_dbg("Second timestamp value %g in line %zu.", ts, inc->line_number);
+       ts -= inc->prev_timestamp;
+       sr_dbg("Timestamp difference %g in line %zu.",
+               ts, inc->line_number);
+       inc->prev_timestamp = 0.0;
+       if (!ts) {
+               sr_warn("Zero timestamp difference in line %zu.",
+                       inc->line_number);
+               return SR_OK;
+       }
+       rate = 1.0 / ts;
+       rate += 0.5;
+       if (!(ts > 0.0) || !(rate < (double)UINT64_MAX)) {
+               sr_warn("Unusable timestamp difference %g in line %zu.",
+                       ts, inc->line_number);
+               return SR_OK;
+       }
+       rate = (uint64_t)rate;
+       sr_dbg("Rate from timestamp %g in line %zu.", rate, inc->line_number);
+       inc->samplerate = rate;
+
+       return SR_OK;
+}
+
  /**
   * @brief Parse routine which ignores the input text.
   *
@@ -867,6 +1023,7 @@ static const col_parse_cb col_parse_funcs[] = {
         [FORMAT_OCT] = parse_logic,
         [FORMAT_HEX] = parse_logic,
         [FORMAT_ANALOG] = parse_analog,
+       [FORMAT_TIME] = parse_timestamp,
  };
  
  static int init(struct sr_input *in, GHashTable *options)
@@ -1026,7 +1183,7 @@ static int initial_parse(const struct sr_input *in, GString *buf)
  {
         struct context *inc;
         size_t num_columns;
-       size_t line_number, line_idx, ch_idx;
+       size_t line_number, line_idx;
         int ret;
         char **lines, *line, **columns;
  
@@ -1107,6 +1264,11 @@ static int initial_parse(const struct sr_input *in, GString *buf)
          * for datafeed submission that is a multiple of the unit size.
          * Allocate the larger buffer, the "sample buffer" will point
          * to a location within that large buffer later.
+        *
+        * TODO Move channel creation here, and just store required
+        * parameters in the format parser above? Could simplify the
+        * arrangement that logic and analog channels get created in
+        * strict sequence in their respective groups.
          */
         if (inc->logic_channels) {
                 inc->sample_unit_size = (inc->logic_channels + 7) / 8;
@@ -1124,7 +1286,9 @@ static int initial_parse(const struct sr_input *in, GString *buf)
         if (inc->analog_channels) {
                 size_t sample_size, sample_count;
                 size_t detail_idx;
+               struct column_details *detail;
                 int *digits_item;
+               void *channel;
                 sample_size = sizeof(inc->analog_datafeed_buffer[0]);
                 inc->analog_datafeed_buf_size = CHUNK_SIZE;
                 inc->analog_datafeed_buf_size /= sample_size;
@@ -1137,18 +1301,16 @@ static int initial_parse(const struct sr_input *in, GString *buf)
                         goto out;
                 }
                 inc->analog_datafeed_buf_fill = 0;
-               inc->analog_datafeed_channels = g_malloc0_n(inc->analog_channels, sizeof(inc->analog_datafeed_channels[0]));
-               for (ch_idx = 0; ch_idx < inc->analog_channels; ch_idx++) {
-                       void *channel;
-                       channel = g_slist_nth_data(in->sdi->channels, inc->logic_channels + ch_idx);
-                       inc->analog_datafeed_channels[ch_idx] = g_slist_append(NULL, channel);
-               }
+               inc->analog_datafeed_channels = g_malloc0(inc->analog_channels * sizeof(inc->analog_datafeed_channels[0]));
                 inc->analog_datafeed_digits = g_malloc0(inc->analog_channels * sizeof(inc->analog_datafeed_digits[0]));
                 digits_item = inc->analog_datafeed_digits;
                 for (detail_idx = 0; detail_idx < inc->column_want_count; detail_idx++) {
-                       if (inc->column_details[detail_idx].text_format != FORMAT_ANALOG)
+                       detail = &inc->column_details[detail_idx];
+                       if (!format_is_analog(detail->text_format))
                                 continue;
-                       *digits_item++ = inc->column_details[detail_idx].analog_digits;
+                       channel = g_slist_nth_data(in->sdi->channels, detail->channel_index);
+                       inc->analog_datafeed_channels[detail->channel_offset] = g_slist_append(NULL, channel);
+                       *digits_item++ = detail->analog_digits;
                 }
         }
  
@@ -1435,17 +1597,17 @@ enum option_index {
  static struct sr_option options[] = {
         [OPT_COL_FMTS] = {
                 "column_formats", "Column format specs",
-               "Specifies text columns data types: comma separated list of [<cols>]<fmt>[<bits>], with -/x/o/b/l format specifiers.",
+               "Specifies text columns data types: A comma separated list of [<cols>]<fmt>[<bits>] items, with - to ignore columns, x/o/b/l for logic data, a (and resolution) for analog data, t for timestamps.",
                 NULL, NULL,
         },
         [OPT_SINGLE_COL] = {
                 "single_column", "Single column",
-               "Enable single-column mode, exclusively use text from the specified column (number starting at 1).",
+               "Enable single-column mode, exclusively use text from the specified column (number starting at 1). Obsoleted by 'column_formats'.",
                 NULL, NULL,
         },
         [OPT_FIRST_COL] = {
                 "first_column", "First column",
-               "Number of the first column with logic data in simple multi-column mode (number starting at 1, default 1).",
+               "Number of the first column with logic data in simple multi-column mode (number starting at 1, default 1). Obsoleted by 'column_formats'.",
                 NULL, NULL,
         },
         [OPT_NUM_LOGIC] = {
@@ -1455,7 +1617,7 @@ static struct sr_option options[] = {
         },
         [OPT_FORMAT] = {
                 "single_format", "Data format for simple single-column mode.",
-               "The number format of single-column mode input data: bin, hex, oct.",
+               "The number format of single-column mode input data: bin, hex, oct. Obsoleted by 'column_formats'.",
                 NULL, NULL,
         },
         [OPT_START] = {
@@ -1465,12 +1627,12 @@ static struct sr_option options[] = {
         },
         [OPT_HEADER] = {
                 "header", "Get channel names from first line.",
-               "Use the first processed line's column captions (when available) as channel names.",
+               "Use the first processed line's column captions (when available) as channel names. Off by default",
                 NULL, NULL,
         },
         [OPT_RATE] = {
                 "samplerate", "Samplerate (Hz)",
-               "The input data's sample rate in Hz.",
+               "The input data's sample rate in Hz. No default value.",
                 NULL, NULL,
         },
         [OPT_DELIM] = {
@@ -1480,7 +1642,7 @@ static struct sr_option options[] = {
         },
         [OPT_COMMENT] = {
                 "comment_leader", "Comment leader character",
-               "The text which starts comments at the end of text lines.",
+               "The text which starts comments at the end of text lines, semicolon by default.",
                 NULL, NULL,
         },
         [OPT_MAX] = ALL_ZERO,