aboutsummaryrefslogtreecommitdiffstats
path: root/makedb.ml
blob: dadbdf0e94a4c24e115752f360f915c765a73c3a (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
(** Init script for the SQLite database. **)
let inits = "\
	DROP TABLE IF EXISTS elements; \
	DROP TABLE IF EXISTS searchtable; \
	CREATE TABLE elements ( \
		num INTEGER PRIMARY KEY NOT NULL, \
		symb TEXT UNIQUE NOT NULL, \
		name TEXT UNIQUE NOT NULL, \
		mass FLOAT, \
		cpk_color TEXT, \
		e_config TEXT, \
		e_neg FLOAT, \
		rad FLOAT, \
		ionization_e FLOAT, \
		e_affinity FLOAT, \
		oxidation TEXT, \
		state TEXT, \
		melting FLOAT, \
		boiling FLOAT, \
		density FLOAT, \
		block TEXT, \
		discovered INTEGER \
	) WITHOUT ROWID; \
	CREATE TABLE searchtable ( \
		name TEXT PRIMARY KEY NOT NULL, \
		dat BLOB NOT NULL) WITHOUT ROWID; \
"

(** Converts the column data in the PubChem JSON file to
    a string option array. *)
let toarr json = let open Yojson.Basic.Util in
	json |> member "Table" |> member "Columns" |> member "Column"
	|> to_list |> filter_string |> Array.of_list 
(** The PubChem JSON file is compressed by keeping all key values in
    a single array at the top called [Column]. *)

(** Ignores all rows or throws an exception, then resets [stmt]. *)
let allstep stmt = let open Sqlite3 in
	let rec f = function
	| Rc.ROW -> f (step stmt)
	| x -> Rc.check x
	in
	f (step stmt);
	reset stmt |> Rc.check

(** Initializes [db] from [json]. *)
let makedb db json = let open Sqlite3
	in let () = exec db inits |> Rc.check
	in let symbs = ref []
	in let names = ref []
	in let colarr = toarr json
	in let stmt = "INSERT INTO elements \
		(num, \
		symb, \
		name, \
		mass, \
		cpk_color, \
		e_config, \
		e_neg, \
		rad, \
		ionization_e, \
		e_affinity, \
		oxidation, \
		state, \
		melting, \
		boiling, \
		density, \
		block, \
		discovered) \
		VALUES \
		(:AtomicNumber, \
		:Symbol, \
		:Name, \
		:AtomicMass, \
		:CPKHexColor, \
		:ElectronConfiguration, \
		:Electronegativity, \
		:AtomicRadius, \
		:IonizationEnergy, \
		:ElectronAffinity, \
		:OxidationStates, \
		:StandardState, \
		:MeltingPoint, \
		:BoilingPoint, \
		:Density, \
		:GroupBlock, \
		:YearDiscovered);"
	|> prepare db

	in let binds stmt s v = bind_name stmt (":" ^ s) v |> Rc.check

	(* Bind each column name. In SQLite3 the leading ":" is part
	of the named parameter. *)
	in let prep i x =
		let lcase = String.lowercase_ascii
		in let x = (match colarr.(i) with
		| "Symbol" -> symbs := (lcase x,None)::!symbs; lcase x
		| "Name" -> names := (lcase x,None)::!names; lcase x
		| "YearDiscovered" -> if x = "Ancient" then "" else x
		| _ -> x
		) in
		binds stmt colarr.(i)
		    (if x = "" then Data.NULL else Data.TEXT x)

	(* Cache fuzzy matching automaton. *)
	in let do_model stmt name vals =
		let open Batch_jaro_winkler
		in let model = build_exportable_model
			~encoding:Encoding.UTF8 vals
		in
		binds stmt "name" (Data.TEXT name);
		binds stmt "cache" (Data.BLOB model);
		allstep stmt

	in let open Yojson.Basic.Util
	in let access_data x = x |> member "Cell" |> to_list
		|> filter_string |> List.iteri prep;
		allstep stmt
	in json |> member "Table" |> member "Row" |> to_list
		|> List.iter access_data;
		finalize stmt |> Rc.check;
	let stmt = "INSERT INTO searchtable VALUES (:name, :cache);"
		|> prepare db
	in
	do_model stmt "symb" !symbs;
	do_model stmt "name" !names;
	finalize stmt |> Rc.check