encoding.csv: add a sequential reader too (suitable for very large .csv files, it does not read everything at once) (#20140)

2025-09-13 22:42:26 +03:00 · 2023-12-10 23:57:08 +01:00 · 2023-12-10 23:57:08 +01:00 · cfcbcb416a
commit cfcbcb416a
parent 99d9473643
4 changed files with 439 additions and 10 deletions
--- a/vlib/encoding/csv/csv_reader_sequential.v
+++ b/vlib/encoding/csv/csv_reader_sequential.v
@ -0,0 +1,297 @@
+/*
+csv serial reader 1.0 alpha
+
+Copyright (c) 2023 Dario Deledda. All rights reserved.
+Use of this source code is governed by an MIT license
+that can be found in the LICENSE file.
+
+Known limitations:
+*/
+module csv
+
+import os
+
+@[params]
+pub struct SequentialReaderConfig {
+	scr_buf      voidptr // pointer to the buffer of data
+	scr_buf_len  i64     // if > 0 use the RAM pointed by scr_buf as source of data
+	file_path    string
+	start_index  i64
+	end_index    i64    = -1
+	mem_buf_size int    = 1024 * 64 // default buffer size 64KByte
+	separator    u8     = `,`
+	comment      u8     = `#` // every line that start with the comment char is ignored
+	default_cell string = '*' // return this string if out of the csv boundaries
+	empty_cell   string // return this string if empty cell
+	end_line_len int = endline_cr_len // size of the endline rune
+	quote        u8  = `"` // double quote is the standard quote char
+}
+
+pub struct SequentialReader {
+pub mut:
+	index i64
+
+	f              os.File
+	f_len          i64
+	is_bom_present bool
+
+	start_index i64
+	end_index   i64 = -1
+
+	end_line      u8  = `\n`
+	end_line_len  int = endline_cr_len // size of the endline rune \n = 1, \r\n = 2
+	separator     u8  = `,` // comma is the default separator
+	separator_len int = 1 // size of the separator rune
+	quote         u8  = `"` // double quote is the standard quote char
+
+	comment u8 = `#` // every line that start with the quote char is ignored
+
+	default_cell string = '*' // return this string if out of the csv boundaries
+	empty_cell   string = '#' // retunrn this if empty cell
+	// ram buffer
+	mem_buf_type  u32 // buffer type 0=File,1=RAM
+	mem_buf       voidptr // buffer used to load chars from file
+	mem_buf_size  i64     // size of the buffer
+	mem_buf_start i64 = -1 // start index in the file of the read buffer
+	mem_buf_end   i64 = -1 // end index in the file of the read buffer
+
+	ch_buf []u8 = []u8{cap: 1024}
+	// error management
+	row_count i64
+	col_count i64
+}
+
+// csv_sequential_reader creates a sequential csv reader
+pub fn csv_sequential_reader(cfg SequentialReaderConfig) !&SequentialReader {
+	mut cr := &SequentialReader{}
+
+	cr.start_index = cfg.start_index
+	cr.end_index = cfg.end_index
+
+	// reading from a RAM buffer
+	if cfg.scr_buf != 0 && cfg.scr_buf_len > 0 {
+		cr.mem_buf_type = ram_csv // RAM buffer
+		cr.mem_buf = cfg.scr_buf
+		cr.mem_buf_size = cfg.scr_buf_len
+		if cfg.end_index == -1 {
+			cr.end_index = cfg.scr_buf_len
+		}
+
+		// check if BOM header is in the memory buffer
+		unsafe {
+			if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
+				&& *(&u8(cr.mem_buf) + 2) == 0xBF {
+				cr.is_bom_present = true
+				cr.index += 3 // skip the BOM
+				cr.start_index += 3 // skip the BOM
+			}
+		}
+		cr.mem_buf_start = 0
+		cr.mem_buf_end = cr.mem_buf_size
+
+		// check if is a file source
+	} else if cfg.file_path.len > 0 {
+		if !os.exists(cfg.file_path) {
+			return error('ERROR: file ${cfg.file_path} not found!')
+		}
+		cr.mem_buf_type = file_csv // File buffer
+		// allocate the memory
+		unsafe {
+			cr.mem_buf = malloc(cfg.mem_buf_size)
+			cr.mem_buf_size = cfg.mem_buf_size
+		}
+		cr.f = os.open_file(cfg.file_path, 'rb')!
+
+		cr.f.seek(0, .end)!
+		cr.f_len = cr.f.tell()!
+
+		cr.f.seek(cfg.start_index, .start)!
+		cr.index = cr.f.tell()!
+
+		if cfg.end_index == -1 {
+			cr.end_index = cr.f_len
+		}
+
+		// check if BOM header is in the file
+		if cr.index == 0 {
+			if cr.f.read_into_ptr(cr.mem_buf, 4)! == 4 {
+				unsafe {
+					if *&u8(cr.mem_buf) == 0xEF && *(&u8(cr.mem_buf) + 1) == 0xBB
+						&& *(&u8(cr.mem_buf) + 2) == 0xBF {
+						cr.is_bom_present = true
+						cr.index += 3 // skip the BOM
+						cr.start_index += 3 // skip the BOM
+					}
+				}
+			}
+			cr.f.seek(cfg.start_index, .start)!
+		}
+	}
+
+	cr.default_cell = cfg.default_cell
+	cr.empty_cell = cfg.empty_cell
+	cr.end_line_len = cfg.end_line_len
+	cr.separator = cfg.separator
+	cr.comment = cfg.comment
+	cr.quote = cfg.quote
+
+	return cr
+}
+
+// dispose_csv_reader release the resources used by the csv_reader
+pub fn (mut cr SequentialReader) dispose_csv_reader() {
+	if cr.mem_buf_type == ram_csv {
+		// do nothing, ram buffer is static
+	} else if cr.mem_buf_type == file_csv {
+		// file close
+		if cr.f.is_opened {
+			cr.f.close()
+		}
+
+		// free the allocated memory
+		if cr.mem_buf_size > 0 {
+			unsafe {
+				free(cr.mem_buf)
+			}
+			cr.mem_buf = unsafe { nil }
+			cr.mem_buf_size = 0
+		}
+	}
+}
+
+// has_data return the bytes available for future readings
+pub fn (mut cr SequentialReader) has_data() i64 {
+	return cr.end_index - cr.start_index
+}
+
+fn (mut cr SequentialReader) fill_buffer(index i64) ! {
+	if cr.mem_buf_type == ram_csv {
+		// for now do nothing if ram buffer
+	} else {
+		cr.f.seek(index, .start)!
+		// IMPORTANT: add 64 bit support in vlib!!
+		read_bytes_count := cr.f.read_into_ptr(cr.mem_buf, int(cr.mem_buf_size))!
+		cr.mem_buf_start = index
+		cr.mem_buf_end = index + read_bytes_count
+	}
+}
+
+enum SequentialReadingState as u16 {
+	comment
+	quote
+	after_quote
+	cell
+	newline
+}
+
+// get_next_row get the next row from the CSV file as a string array
+pub fn (mut cr SequentialReader) get_next_row() ![]string {
+	mut row_res := []string{}
+	// clear the cell buffer
+	cr.ch_buf.clear()
+	mut i := cr.start_index
+	mut state := SequentialReadingState.cell
+
+	p := &u8(cr.mem_buf)
+	for i < cr.end_index {
+		if i < cr.mem_buf_start || i >= cr.mem_buf_end {
+			cr.fill_buffer(i)!
+		}
+		unsafe {
+			ch := *(p + i - cr.mem_buf_start)
+
+			if state == .cell {
+				if ch == cr.separator {
+					// must be optimized
+					cr.ch_buf << 0
+					row_res << if (cr.ch_buf.len - 1) == 0 {
+						cr.empty_cell
+					} else {
+						(tos(cr.ch_buf.data, cr.ch_buf.len - 1).clone())
+					}
+					cr.ch_buf.clear()
+				} else if cr.ch_buf.len == 0 && ch == cr.comment && row_res.len == 0 {
+					state = .comment
+				} else if ch == cr.quote {
+					state = .quote
+					cr.ch_buf.clear()
+					cr.col_count++
+					i++
+					continue
+				} else if ch == cr.end_line {
+					cr.row_count++
+					cr.col_count = 0
+
+					// skip empty rows
+					if !(row_res.len == 0 && cr.ch_buf.len < 1) {
+						cr.ch_buf << 0
+						row_res << if (cr.ch_buf.len - 1) == 0 {
+							cr.empty_cell
+						} else {
+							(tos(cr.ch_buf.data, cr.ch_buf.len - 1).clone())
+						}
+						i += cr.end_line_len - 1
+						break
+					}
+				} else if ch == `\r` && cr.end_line_len == 2 {
+					// skip CR
+				} else { // normal char inside a cell
+					cr.ch_buf << ch
+				}
+			}
+
+			if state == .comment {
+				if cr.ch_buf.len > 0 {
+					// must be optimized
+					cr.ch_buf << 0
+					row_res << if (cr.ch_buf.len - 1) == 0 {
+						cr.empty_cell
+					} else {
+						(tos(cr.ch_buf.data, cr.ch_buf.len - 1).clone())
+					}
+					cr.ch_buf.clear()
+				} else if ch == cr.end_line {
+					state = .cell
+				}
+			}
+
+			if state == .quote {
+				if ch == cr.quote {
+					// must be optimized
+					cr.ch_buf << 0
+					row_res << if (cr.ch_buf.len - 1) == 0 {
+						cr.empty_cell
+					} else {
+						(tos(cr.ch_buf.data, cr.ch_buf.len - 1).clone())
+					}
+					cr.ch_buf.clear()
+
+					state = .after_quote
+					cr.col_count++
+					i++
+					continue
+				} else if ch == cr.end_line {
+					return error('ERROR: quote not closed at row ${cr.row_count} after column ${cr.col_count}!')
+				} else { // normal char inside a quote inside a cell
+					cr.ch_buf << ch
+				}
+			}
+
+			if state == .after_quote {
+				if ch == cr.separator {
+					state = .cell
+				} else if ch == cr.end_line {
+					cr.row_count++
+					cr.col_count = 0
+					cr.ch_buf.clear()
+					i += cr.end_line_len - 1
+					break
+				}
+			}
+		}
+		cr.col_count++
+		i++
+	}
+	cr.start_index = i
+	return row_res
+}