Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

switch from XMLDict.jl to XML.jl #125

Draft
wants to merge 3 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ QuadGK = "1fd47b50-473d-5c70-9696-f719f8f3bcdc"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
StringDistances = "88034a9c-02f8-509d-84a9-84ec65e18404"
UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
XML = "72c71f33-b9b6-44de-8c94-c961784809e2"
XMLDict = "228000da-037f-5747-90a9-8195ccbf91a5"

[compat]
Expand Down
2 changes: 2 additions & 0 deletions src/MortalityTables.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ using Requires
import StringDistances
using UnPack
using XMLDict
import XML
using Pkg.Artifacts

include("table_source_map.jl")
Expand Down Expand Up @@ -35,6 +36,7 @@ export MortalityTable,
Constant,
DeathDistribution,
get_SOA_table,
get_SOA_table2,
Makeham, Gompertz, MakehamGompertz,
hazard,cumhazard,
mortality_vector
Expand Down
234 changes: 194 additions & 40 deletions src/XTbML.jl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ function open_and_read(path)
bytes = read(path)
if bytes[1:3] == [0xef, 0xbb, 0xbf]
# Why skip the first three bytes of the response?

# From https://docs.python.org/3/library/codecs.html
# To increase the reliability with which a UTF-8 encoding can be detected,
# Microsoft invented a variant of UTF-8 (that Python 2.5 calls "utf-8-sig")
Expand All @@ -18,9 +18,9 @@ function open_and_read(path)
end

# Parse the given XTbML content with XMLDict and hand back the resulting
# nested dictionary.
getXML(open_file) = XMLDict.xml_dict(open_file)

# get potentially missing value out of dict
Expand Down Expand Up @@ -53,61 +53,215 @@ function parseXTbMLTable(x, path)
comments = get(md, "Comments", nothing) |> strip
source_path = path
d = TableMetaData(
name=name,
id=id,
provider=provider,
reference=reference,
content_type=content_type,
description=description,
comments=comments,
source_path=source_path,
name=name,
id=id,
provider=provider,
reference=reference,
content_type=content_type,
description=description,
comments=comments,
source_path=source_path,
)

if isa(x["XTbML"]["Table"], Vector)
# for a select and ultimate table, will have multiple tables
# parsed into a vector of tables
sel = map(x["XTbML"]["Table"][1]["Values"]["Axis"]) do ai
(issue_age = Parsers.parse(Int, ai[:t]),
rates = [(duration = Parsers.parse(Int, aj[:t]), rate = get_and_parse(aj, "")) for aj in ai["Axis"]["Y"] if !ismissing(get_and_parse(aj, ""))])
rates = [(duration = Parsers.parse(Int, aj[:t]), rate = get_and_parse(aj, "")) for aj in ai["Axis"]["Y"] if !ismissing(get_and_parse(aj, ""))])
end

ult = map(x["XTbML"]["Table"][2]["Values"]["Axis"]["Y"]) do ai
(age = Parsers.parse(Int, ai[:t]), rate = get_and_parse(ai, ""),)
end

else
# a table without select period will just have one set of values

ult = map(x["XTbML"]["Table"]["Values"]["Axis"]["Y"]) do ai
(age = Parsers.parse(Int, ai[:t]),
rate = get_and_parse(ai, ""))
rate = get_and_parse(ai, ""))
end

sel = nothing

end

tbl = XTbMLTable(
sel,
ult,
d
sel,
ult,
d
)

return tbl
end

"""
    parseXTbMLTable2(x, path="")

Build an `XTbMLTable` from an XTbML document `x` parsed with XML.jl, navigating the
tree by child position rather than by tag name.

`x[2]` is treated as the top-level element. When it has more than two children, its
second child is read as the select table and its third child as the ultimate table.
Otherwise its second child is read as the only (ultimate) table and the select part
is `nothing`.

Entries whose `Y` node has no children get a `missing` rate. Such entries are dropped
from the select rates and from the ultimate rates of a non-select table; the ultimate
rates of a select table are kept as parsed.

`path` is currently unused, and the metadata is a default-constructed
`TableMetaData()`.
"""
function parseXTbMLTable2(x, path="")
    # TODO: port the metadata parsing (TableName, ContentType, TableIdentity,
    # ProviderName, TableReference, TableDescription, Comments, source_path) from
    # `parseXTbMLTable`; until then the metadata is left at its defaults.
    d = TableMetaData()

    # The `t` attribute of an axis / `Y` node holds its integer key. XML.jl exposes
    # attributes with String keys, so the lookup uses "t" in every branch.
    key_of(node) = Parsers.parse(Int, XML.attributes(node)["t"])

    # A `Y` node without children carries no rate and is reported as `missing`.
    function rate_of(y)
        isempty(XML.children(y)) && return missing
        return Parsers.parse(Float64, XML.value(y[1]))
    end

    # Presumably the `XTbML` element that follows the XML declaration — TODO confirm.
    top = x[2]

    if length(XML.children(top)) > 2
        # Select and ultimate table: the document holds two `Table` elements.
        # top[2][2] is assumed to be the `Values` node of the first table, whose
        # children are one axis per issue age — TODO confirm against a real file.
        sel = map(XML.children(top[2][2])) do ai
            issue_age = key_of(ai)
            rates = map(XML.children(ai[1])) do aj
                (duration = key_of(aj), rate = rate_of(aj))
            end
            filter!(r -> !ismissing(r.rate), rates)
            (issue_age = issue_age, rates = rates)
        end

        ult = map(XML.children(top[3][2][1])) do ai
            (age = key_of(ai), rate = rate_of(ai))
        end
    else
        # A table without a select period has a single set of values. This branch
        # previously went through `x.root` and a Symbol attribute key, unlike the
        # select branch; it now uses the same positional access and "t" key.
        ult = map(XML.children(top[2][2][1])) do ai
            (age = key_of(ai), rate = rate_of(ai))
        end
        filter!(r -> !ismissing(r.rate), ult)

        sel = nothing
    end

    return XTbMLTable(sel, ult, d)
end

# Convert the raw content of a rate node into a value of type `t`.
function __parse_rate(t, x)
    return Parsers.parse(t, x)
end

# A node with no content (`nothing`) has no rate: report it as `missing`.
function __parse_rate(t, ::Nothing)
    return missing
end

# Draft streaming parser: iterates over the nodes yielded by `x` and collects the
# rates found at `Y` nodes into offset vectors, one vector per run of `Y` nodes.
#
# Arguments:
# - `x`: an iterable of XML.jl nodes (get_SOA_table2 passes an `XML.LazyNode`).
# - `path`: currently unused.
#
# Returns `ov`, the vector of offset vectors collected so far. No `XTbMLTable` is
# built yet (see the commented-out "wrap up" section at the end).
function parseXTbMLTable3(x, path="")
# Metadata parsing from `parseXTbMLTable`, not yet ported to XML.jl:
# md = x["XTbML"]["ContentClassification"]
# name = get(md, "TableName", nothing) |> strip
# content_type = get(get(md, "ContentType", nothing), "", nothing) |> strip
# id = get(md, "TableIdentity", nothing) |> strip
# provider = get(md, "ProviderName", nothing) |> strip
# reference = get(md, "TableReference", nothing) |> strip
# description = get(md, "TableDescription", nothing) |> strip
# comments = get(md, "Comments", nothing) |> strip
# source_path = path
# d = TableMetaData(
# name=name,
# id=id,
# provider=provider,
# reference=reference,
# content_type=content_type,
# description=description,
# comments=comments,
# source_path=source_path,
# )
# Placeholder metadata; `d` is not used further in this function.
d=TableMetaData()

# Parser state carried across loop iterations.
# `sel_start_age` records the first select issue age seen; it is not read afterwards.
sel_start_age = nothing
# `dur` is never assigned again or read below.
dur = nothing
# `ia` is the age used as the index offset of the vector currently being filled.
ia = nothing
ov = OffsetVector{Union{Missing, Float64}, Vector{Union{Missing, Float64}}}[] # a container for the offset vectors
iv = Union{Missing,Float64}[] # a non-offset container vector for the innermost values

for n in x #first(x,100)
# println(n)
# An `Axis` node that carries attributes starts the rates of one issue age.
if XML.tag(n) == "Axis" && !isnothing(XML.attributes(n))
# a select table
ia = Parsers.parse(Int, XML.attributes(n)["t"])
if isnothing(sel_start_age)
sel_start_age = ia
end
# @show ia

else
# an ultimate table

end

# No age known yet: take it from the `t` attribute of the first `Y` node seen.
if isnothing(ia) && XML.tag(n) == "Y"
ia = Parsers.parse(Int, XML.attributes(n)["t"])
# @show "ult", ia
end

# A `Y` node holds one rate. If the node following it is a text node, that text is
# parsed as the rate; otherwise `XML.value(n)` is used, which `__parse_rate` turns
# into `missing` when it is `nothing`.
if XML.tag(n) == "Y"
p = XML.next(n)
val = if XML.nodetype(p) == XML.Text
__parse_rate(Float64,XML.value(p))
else
__parse_rate(Float64,XML.value(n))
end
# TODO? ignore trailing missings
push!(iv,val)
end

# Any `Axis` node reached while `iv` holds values closes the current vector.
# NOTE(review): when this `Axis` has attributes, `ia` was already overwritten with
# its `t` value above, so the vector is pushed with the new axis's offset rather
# than the one it was collected under — confirm this is intended.
if XML.tag(n) == "Axis" && length(iv) > 0
# @show "end of vec"
# new vector. Save and reset
push!(ov,OffsetArray(iv,ia-1))
iv = Union{Missing,Float64}[]
ia = nothing
end
end

# NOTE(review): values still in `iv` when the loop ends are not pushed to `ov`, so
# the last run of `Y` nodes is missing from the result unless another `Axis` follows.

# wrap up:
# - if there are multiple vectors then it's a select table and the last one is the ultimate rates

# tbl = XTbMLTable(
# sel,
# ult,
# d
# )

return ov
end

function XTbML_Table_To_MortalityTable(tbl::XTbMLTable)
ult = UltimateMortality(
[v.rate for v in tbl.ultimate],
start_age=tbl.ultimate[1].age
)

[v.rate for v in tbl.ultimate],
start_age=tbl.ultimate[1].age
)
ult_omega = lastindex(ult)

if !isnothing(tbl.select)
sel = map(tbl.select) do (issue_age, rates)
last_sel_age = issue_age + rates[end].duration - 1
first_defined_select_age = issue_age + rates[1].duration - 1
last_sel_age = issue_age + last(rates).duration - 1
first_defined_select_age = issue_age + first(rates).duration - 1
last_age = max(last_sel_age, ult_omega)
vec = map(issue_age:last_age) do attained_age
if attained_age < first_defined_select_age
Expand All @@ -123,15 +277,15 @@ function XTbML_Table_To_MortalityTable(tbl::XTbMLTable)
return mortality_vector(vec, start_age=issue_age)
end
sel = OffsetArray(sel, tbl.select[1].issue_age - 1)

return MortalityTable(sel, ult, metadata=tbl.d)
else
return MortalityTable(ult, metadata=tbl.d)
end
end

"""
readXTbML(path)
readXTbML(path)

Loads the [XTbML](https://mort.soa.org/About.aspx) file (the SOA XML data format for mortality tables) stored at the given path and returns a `MortalityTable`.
"""
Expand All @@ -144,7 +298,7 @@ end
# Load Available Tables ###

"""
read_tables(dir=nothing)
read_tables(dir=nothing)

Loads the [XTbML](https://mort.soa.org/About.aspx) files (the SOA XML data format for mortality tables) stored in the given path. If no path is specified, it loads the tables in the MortalityTables package directory. To see where your system keeps packages, run `DEPOT_PATH` from a Julia REPL.
"""
Expand All @@ -170,18 +324,18 @@ end

# this is used to generate the table mapping in table_source_map.jl
function _write_available_tables()
table_dir = artifact"mort.soa.org"
table_dir = artifact"mort.soa.org"
tables = []
@info "Loading built-in Mortality Tables..."
for (root, dirs, files) in walkdir(table_dir)
for file in files
if endswith(file,".xml") && !startswith(file,".")
x = open_and_read(joinpath(root,file)) |> XMLDict.xml_dict
md = x["XTbML"]["ContentClassification"]
name = get(md, "TableName", nothing) |> strip
content_type = get(get(md, "ContentType", nothing), "", nothing) |> strip
id = get(md, "TableIdentity", nothing) |> strip
push!(tables,(source="mort.soa.org",name=name,id=Parsers.parse(Int,id)))
x = open_and_read(joinpath(root,file)) |> XMLDict.xml_dict
md = x["XTbML"]["ContentClassification"]
name = get(md, "TableName", nothing) |> strip
content_type = get(get(md, "ContentType", nothing), "", nothing) |> strip
id = get(md, "TableIdentity", nothing) |> strip
push!(tables,(source="mort.soa.org",name=name,id=Parsers.parse(Int,id)))
end
end
end
Expand Down
24 changes: 24 additions & 0 deletions src/get_SOA_table.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,30 @@ function get_SOA_table(id::Int)
readXTbML(joinpath(artifact"mort.soa.org", "t$id.xml"))
end

# Experimental XML.jl-based counterpart of `get_SOA_table(id)`: opens the bundled
# `t<id>.xml` artifact file as a lazily parsed XML document and passes it to
# `parseXTbMLTable3`, whose result is returned as-is.
function get_SOA_table2(id::Int)
    table_file = joinpath(artifact"mort.soa.org", "t$id.xml")

    # Why skip the first three bytes of the response?
    #
    # From https://docs.python.org/3/library/codecs.html
    # To increase the reliability with which a UTF-8 encoding can be detected,
    # Microsoft invented a variant of UTF-8 (that Python 2.5 calls "utf-8-sig")
    # for its Notepad program: Before any of the Unicode characters is written
    # to the file, a UTF-8 encoded BOM (which looks like this as a byte sequence:
    # 0xef, 0xbb, 0xbf) is written.
    has_bom = read(table_file, 3) == [0xef, 0xbb, 0xbf]

    doc = open(table_file, "r") do io
        if has_bom
            # Move past the BOM before mapping the file; the mapping is taken
            # from the stream's current position.
            skip(io, 3)
        end

        XML.LazyNode(XML.Raw(XML.Mmap.mmap(io)))
    end

    # The conversion step `XTbML_Table_To_MortalityTable(t)` is not applied yet.
    return parseXTbMLTable3(doc, table_file)
end

function get_SOA_table(table_name::String; source_map = table_source_map)
entry = get(source_map, table_name, nothing)
if entry === nothing
Expand Down