From: Gerhard Sittig <redacted>
Date: Sun, 5 Sep 2021 10:45:26 +0000 (+0200)
Subject: input/vcd: detect and skip string data types (value not used)
X-Git-Url: https://sigrok.org/gitweb/?p=libsigrok.git;a=commitdiff_plain;h=c1310f7deba7ac1f4ce75fbb5cc9b498ee66aa49

input/vcd: detect and skip string data types (value not used)

The nMigen utility was reported to emit string values in VCD files (by
means of the pyvcd Python module). The $var declaration specifies the
'string' data type, and text values are prefixed with 's' similar to
integer or real numbers and bit vectors.

It is desirable to process these files and skip the unsupported type
which does not map to any sigrok channel type. Add support to detect the
type in declarations and change entries, skip these change entries and
emit warnings so that users remain aware of the incomplete import. Check
the validity of the entry though, to keep robustness of the existing
implementation of the VCD input module.

The validity check is an educated guess that was written in the absence
of an official specification or any public documentation, encodes rather
arbitrary assumptions. Deals with cases that were reported, may need
more test coverage and future tweaking. This version forbids whitespace
in values, and exclusively accepts printable ASCII in the string
presentation, with a few escape sequences. Empty strings are covered.

This also resolves bug #1757.

Reported-By: Anton Blanchard <redacted> via github PR 157
---

diff --git a/src/input/vcd.c b/src/input/vcd.c
index 2cb4eeed..41371230 100644
--- a/src/input/vcd.c
+++ b/src/input/vcd.c
@@ -821,6 +821,7 @@ static int parse_header_var(struct context *inc, char *contents)
 	size_t length;
 	char *type, *size_txt, *id, *ref, *idx;
 	gboolean is_reg, is_wire, is_real, is_int;
+	gboolean is_str;
 	enum sr_channeltype ch_type;
 	size_t size, next_size;
 	struct vcd_channel *vcd_ch;
@@ -849,11 +850,19 @@ static int parse_header_var(struct context *inc, char *contents)
 	is_wire = g_strcmp0(type, "wire") == 0;
 	is_real = g_strcmp0(type, "real") == 0;
 	is_int = g_strcmp0(type, "integer") == 0;
+	is_str = g_strcmp0(type, "string") == 0;
 
 	if (is_reg || is_wire) {
 		ch_type = SR_CHANNEL_LOGIC;
 	} else if (is_real || is_int) {
 		ch_type = SR_CHANNEL_ANALOG;
+	} else if (is_str) {
+		sr_warn("Skipping id %s, name '%s%s', unsupported type '%s'.",
+			id, ref, idx ? idx : "", type);
+		inc->ignored_signals = g_slist_append(inc->ignored_signals,
+			g_strdup(id));
+		g_strfreev(parts);
+		return SR_OK;
 	} else {
 		sr_err("Unsupported signal type: '%s'", type);
 		g_strfreev(parts);
@@ -1510,6 +1519,95 @@ static uint8_t vcd_char_to_value(char bit_char, int *warn)
 	return ~0;
 }
 
+/*
+ * Check the validity of a VCD string value. It's essential to reliably
+ * accept valid data which the community uses in the field, yet robustly
+ * reject invalid data for users' awareness. Since IEEE 1800-2017 would
+ * not discuss the representation of this data type, it's assumed to not
+ * be an official feature of the VCD file format. This implementation is
+ * an educated guess after inspection of other arbitrary implementations,
+ * not backed by any specification or public documentation.
+ *
+ * A quick summary of the implemented assumptions: Must be a sequence of
+ * ASCII printables. Must not contain whitespace. Might contain escape
+ * sequences: A backslash followed by a single character, like '\n' or
+ * '\\'. Or a backslash and the letter x followed by two hex digits,
+ * like '\x20'. Or a backslash followed by three octal digits, like
+ * '\007'. As an exception also accepts a single digit '\0' but only at
+ * the text end. The string value may be empty, but must not be NULL.
+ *
+ * This implementation assumes an ASCII based platform for simplicity
+ * and readability. Should be a given on sigrok supported platforms.
+ */
+static gboolean vcd_string_valid(const char *s)
+{
+	char c;
+
+	if (!s)
+		return FALSE;
+
+	while (*s) {
+		c = *s++;
+		/* Reject non-printable ASCII chars including DEL. */
+		if (c < ' ')
+			return FALSE;
+		if (c > '~')
+			return FALSE;
+		/* Deeper inspection of escape sequences. */
+		if (c == '\\') {
+			c = *s++;
+			switch (c) {
+			case 'a': /* BEL, bell aka "alarm" */
+			case 'b': /* BS, back space */
+			case 't': /* TAB, tabulator */
+			case 'n': /* NL, newline */
+			case 'v': /* VT, vertical tabulator */
+			case 'f': /* FF, form feed */
+			case 'r': /* CR, carriage return */
+			case '"': /* double quotes */
+			case '\'': /* tick, single quote */
+			case '?': /* question mark */
+			case '\\': /* backslash */
+				continue;
+			case 'x': /* \xNN two hex digits */
+				c = *s++;
+				if (!g_ascii_isxdigit(c))
+					return FALSE;
+				c = *s++;
+				if (!g_ascii_isxdigit(c))
+					return FALSE;
+				continue;
+			case '0': /* \NNN three octal digits */
+			case '1':
+			case '2':
+			case '3':
+			case '4':
+			case '5':
+			case '6':
+			case '7':
+				/* Special case '\0' at end of text. */
+				if (c == '0' && !*s)
+					return TRUE;
+				/*
+				 * First digit was covered by the outer
+				 * switch(). Two more digits to check.
+				 */
+				c = *s++;
+				if (!g_ascii_isdigit(c) || c > '7')
+					return FALSE;
+				c = *s++;
+				if (!g_ascii_isdigit(c) || c > '7')
+					return FALSE;
+				continue;
+			default:
+				return FALSE;
+			}
+		}
+	}
+
+	return TRUE;
+}
+
 /* Parse one text line of the data section. */
 static int parse_textline(const struct sr_input *in, char *lines)
 {
@@ -1518,7 +1616,8 @@ static int parse_textline(const struct sr_input *in, char *lines)
 	char **words;
 	size_t word_count, word_idx;
 	char *curr_word, *next_word, curr_first;
-	gboolean is_timestamp, is_section, is_real, is_multibit, is_singlebit;
+	gboolean is_timestamp, is_section;
+	gboolean is_real, is_multibit, is_singlebit, is_string;
 	uint64_t timestamp;
 	char *identifier, *endptr;
 	size_t count;
@@ -1697,6 +1796,7 @@ static int parse_textline(const struct sr_input *in, char *lines)
 		 * timestamp.
 		 *
 		 * Supported input data formats are:
+		 * - S<value> <sep> <id> (value not used, VCD type 'string').
 		 * - R<value> <sep> <id> (analog channel, VCD type 'real').
 		 * - B<value> <sep> <id> (analog channel, VCD type 'integer').
 		 * - B<value> <sep> <id> (logic channels, VCD bit vectors).
@@ -1725,6 +1825,7 @@ static int parse_textline(const struct sr_input *in, char *lines)
 		is_singlebit |= curr_first == 'l' || curr_first == 'h';
 		is_singlebit |= curr_first == 'x' || curr_first == 'z';
 		is_singlebit |= curr_first == 'u' || curr_first == '-';
+		is_string = curr_first == 's';
 		if (is_real) {
 			char *real_text;
 			float real_val;
@@ -1872,6 +1973,32 @@ static int parse_textline(const struct sr_input *in, char *lines)
 			process_bits(inc, identifier, inc->conv_bits.value, 1);
 			continue;
 		}
+		if (is_string) {
+			const char *str_value;
+
+			str_value = &curr_word[1];
+			identifier = next_word;
+			word_idx++;
+			if (!vcd_string_valid(str_value)) {
+				sr_err("Invalid string data: %s", str_value);
+				ret = SR_ERR_DATA;
+				break;
+			}
+			if (!identifier || !*identifier) {
+				sr_err("String value without identifier.");
+				ret = SR_ERR_DATA;
+				break;
+			}
+			sr_spew("Got string data, id '%s', value \"%s\".",
+				identifier, str_value);
+			if (!is_ignored(inc, identifier)) {
+				sr_err("String value for identifier '%s'.",
+					identifier);
+				ret = SR_ERR_DATA;
+				break;
+			}
+			continue;
+		}
 
 		/* Design choice: Consider unsupported input fatal. */
 		sr_err("Unknown token '%s'.", curr_word);