I am new to Elixir, learning it. I have a list of US Cities from this github link https://raw.githubusercontent.com/grammakov/USA-cities-and-states/master/us_cities_states_counties.csv
I am trying to process this, remove the duplicate after removing the last two columns.
I have a NodeJS script doing the same but it’s way faster the elixir. Can anyone help me improve it.
Elixir : 55.9859 sec | Node: 4.2381143870018425 sec
Elixir Script.
#!/usr/bin/env elixir
defmodule Benchmark do
def measure(function) do
time =
function
|> :timer.tc()
|> elem(0)
|> Kernel./(1_000_000)
|> to_string()
IO.puts(time <> " sec")
end
end
defmodule UsCitiesStates do
def uniq([]), do: []
def uniq([head | tail]) do
[head | for(x <- uniq(tail), x != head, do: x)]
end
def process() do
IO.puts("stated...")
list =
File.stream!("./us_cities_states_counties.csv")
|> Stream.map(&String.trim(&1))
|> Stream.map(&String.split(&1, "|"))
|> Stream.filter(fn
["city" | _] -> false
_ -> true
end)
|> Stream.map(&(Stream.drop(&1, -2) |> Enum.to_list()))
cities =
list
|> Enum.to_list()
|> uniq()
IO.inspect(cities)
IO.inspect(length(cities))
end
end
Benchmark.measure(&UsCitiesStates.process/0)
Elxir Result
stated...
[
["Holtsville", "NY", "New York"],
["Adjuntas", "PR", "Puerto Rico"],
["Aguirre", "PR", "Puerto Rico"],
["Aibonito", "PR", ...],
["Maunabo", ...],
[...],
...
]
29860
55.9859 sec
Node
import fs from "node:fs";
import readline from "node:readline/promises";
import { PerformanceObserver, performance } from "node:perf_hooks";
const perf = new PerformanceObserver((list) => {
for (const entry of list.getEntries()) {
console.log(entry.duration / 1000, "sec");
}
});
perf.observe({ entryTypes: ["measure"], buffered: true });
(async function main() {
try {
performance.mark("start");
const data = [];
const rl = readline.createInterface({
input: fs.createReadStream("./us_cities_states_counties.csv"),
});
const clean = (msg, err) => {
console.log(msg);
if (msg === "error") console.log("error", err || "none");
if (msg === "close") {
performance.mark("end");
performance.measure("fs-stat", "start", "end");
console.log(data);
}
};
rl.on("line", (ln) => {
const d = ln.toString().trim();
if (d === "city|state_short|state_full|county|city_alias") return;
const city = d.split("|").slice(0, -2).join(",");
if (data.includes(city)) return;
data.push(city);
});
rl.on("close", () => clean("close"));
} catch (e) {
console.log(e);
}
})();
Node Result
close
[
[ 'Holtsville', 'NY', 'New York' ],
[ 'Adjuntas', 'PR', 'Puerto Rico' ],
[ 'Aguada', 'PR', 'Puerto Rico' ],
... 29760 more items
]
4.2381143870018425 sec