Unfortunately, the .csv files that AmsterdamUMCdb is distributed as are encoded as unicode-escape rather than utf-8. However, polars (which reprodICU is built upon) only supports utf-8 encoding, so the .csv files must be converted beforehand.
To do this, run the following script:
# Re-encode AmsterdamUMCdb .csv files from unicode-escape to utf-8 so that
# polars (utf-8 only) can read them. Each input "<name>.csv" is written out
# as "<name>.csv_.csv" next to the original; the gzipped numericitems table
# is streamed through the same conversion without decompressing to disk.
import csv
import gzip

# Plain-text CSV files to re-encode.
paths = [
    "admissions.csv",
    "drugitems.csv",
    "freetextitems.csv",
    "listitems.csv",
    "procedureorderitems.csv",
    "processitems.csv",
]

for p in paths:
    # newline="" is required by the csv module on both ends: without it the
    # text layer translates the "\r\n" row terminators (producing "\r\r\n"
    # on Windows) and misparses embedded newlines on read.
    with open(
        p, "r", encoding="unicode-escape", errors="ignore", newline=""
    ) as infile, open(
        p + "_.csv", "w", encoding="utf-8", newline=""
    ) as outfile:
        print(f"Converting {p} to csv")
        # writerows iterates at C speed; no per-row Python loop needed.
        csv.writer(outfile).writerows(csv.reader(infile))

# numericitems is gzip-compressed; gzip.open in text mode accepts the same
# encoding/newline parameters as open().
path_gz = "numericitems.csv.gz"
with gzip.open(
    path_gz, "rt", encoding="unicode-escape", errors="ignore", newline=""
) as infile, gzip.open(
    path_gz + "_.csv.gz", "wt", encoding="utf-8", newline=""
) as outfile:
    csv.writer(outfile).writerows(csv.reader(infile))
Unfortunately, the headers of the https://physionet.org/content/mimiciii-demo/1.4/ dataset are in lowercase and thus incompatible with the MIMIC-III processing pipeline in reprodICU.
To fix this, run the following shell command, which converts the headers to UPPERCASE:
for file in *; do awk 'NR ,==1{$0=toupper($0)} 1' $file > ../headerfix/$file; done
To extract the minute-resolution values from the raw_data column of the data_float_h table, run the following unpacking script. The data is saved in the data_float_m table.
(modified from source: https://github.com/nrodemund/sicdb/blob/main/Scripts/Unpack raw data/unpack.py for progress tracking → runtime approx. 2.5h)
import csv
import gzip
import os
import struct
os.chdir(os.path.dirname(os.path.abspath(__file__)))
# Hoisted out of the per-value loop: the format is parsed once instead of
# on every one of the ~2.6 billion struct.unpack("<f", ...) calls.
_FLOAT_LE = struct.Struct("<f")
# An all-zero 4-byte word marks a missing (null) sample and is skipped.
_ZERO_WORD = b"\x00\x00\x00\x00"


def set_raw_values(row, dictwriter, n):
    """Explode one data_float_h row into per-minute rows on *dictwriter*.

    row["rawdata"] holds a hex-encoded bytea (2-char prefix, e.g. "\\x")
    containing consecutive little-endian 32-bit floats, one sample per
    minute starting at row["Offset"] seconds. All-zero words are treated
    as nulls and skipped.

    n is the last primary key already issued; the updated value is
    returned so the caller keeps the id sequence monotonic across rows.
    """
    t = int(row["Offset"])
    data = bytes.fromhex(row["rawdata"][2:])  # strip prefix, hex -> bytes
    for i in range(len(data) // 4):
        start = i * 4
        word = data[start : start + 4]
        if word == _ZERO_WORD:
            continue  # no null values
        n += 1  # new primary key
        newrow = row.copy()
        del newrow["rawdata"]  # not needed
        del newrow["cnt"]  # not needed
        newrow["id"] = n  # primary key
        newrow["Val"] = _FLOAT_LE.unpack(word)[0]  # bytes to float
        newrow["Offset"] = t + i * 60  # one sample per minute
        dictwriter.writerow(newrow)
    return n
n = 0  # running primary key across all unpacked rows
N = 2.6e9  # approx. 2.6 billion entries expected (progress denominator)
last_report = -1  # last million-boundary already printed

# newline="" so the csv module's "\r\n" terminators are written verbatim.
with gzip.open("data_float_m.csv.gz", "wt", newline="") as csvfile:
    dict_writer = csv.DictWriter(
        csvfile, ["id", "CaseID", "DataID", "Offset", "Val"]
    )
    dict_writer.writeheader()
    with gzip.open(
        "data_float_h.csv.gz", "rt", encoding="utf-8", newline=""
    ) as gzf:
        for row in csv.DictReader(gzf):
            n = set_raw_values(row, dict_writer, n)
            # n advances by many entries per source row, so the original
            # exact test `n % 1e6 == 0` almost never fired; report whenever
            # a new million boundary has been crossed instead.
            milestone = n // 1_000_000
            if milestone > last_report:
                last_report = milestone
                # end="\r" (a real carriage return, not a literal "\r"
                # backslash sequence) rewrites the progress line in place
                print(f"Processing entry {n:_.0f} ({n/N:6.1%})", end="\r")