-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathapp.py
297 lines (259 loc) · 10.4 KB
/
app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
# IMPORTING TOOLS
import streamlit as st
from rdflib import Graph, Literal
from rdflib.plugins.sparql import prepareQuery
import pandas as pd
import plotly.express as px
import numpy as np
# SET PAGE SETTINGS
st.set_page_config(page_title='Amusement Accidents', layout="centered")
# METHOD TO LOAD THE RDF
@st.cache(persist=True)
def importRDF(filename, format):
graph = Graph().parse(filename, format)
return graph
# IMPORTING THE RDF
with st.spinner('Loading all the stuffs...'):
graph = importRDF("./RDF/rdf-dataset.ttl", "ttl")
# METHOD TO CONVERT THE QUERY RESULT INTO A DATAFRAME
def sparql_results_to_df(results):
return pd.DataFrame(
data=([None if x is None else x.toPython() for x in row] for row in results),
columns=[str(x) for x in results.vars],
)
# METHOD TO EXECUTE A GENERIC QUERY
def computeQuery(query, executor):
result = executor.query(query)
res_df = sparql_results_to_df(result)
return res_df
# METHOD TO EXECUTE A PARAMETRIC QUERY
def rideAccidentDescription(ride_name, executor):
ride_name = Literal(ride_name)
query = """
PREFIX ride_type: <http://example.org/ride_type#>
PREFIX acc: <http://example.org/accident#>
PREFIX ride: <http://example.org/ride#>
SELECT (?manuf AS ?Manufacturer) (?description AS ?Accident_Description)
WHERE {
?instance acc:description ?description ;
acc:ref-ride_id ?ride_id .
?ride_id ride:name ?name ;
ride:manufacturer ?manuf .
FILTER (?name = ?ride_name)
}
"""
prep_query = prepareQuery(query)
r = executor.query(prep_query, initBindings={'ride_name': ride_name})
return sparql_results_to_df(r), query
# PROCESSING & DISPLAY
def display():
with st.container():
st.write("#### What are the months with the highest number of accidents?")
res = computeQuery(query_5, graph)
fig = px.bar(res, x="mon", y="count", color="count", labels={"mon":"Month", "count":"Num. of Accidents"}, text_auto="True")
fig.update_xaxes(type="category")
fig.update_yaxes(showticklabels=False)
st.plotly_chart(fig, use_container_width=True)
with st.expander("Show query"):
st.code(query_5, language="sparql")
st.markdown("---")
with st.container():
st.write("#### Which cities and states have recorded the most accidents?")
res = computeQuery(query_8, graph)
fig = px.treemap(res, path=[px.Constant("U.S"), "state", "city"], values="count", hover_data=["state", "city","count"],
color="count",
color_continuous_scale='tealrose',
color_continuous_midpoint=np.average(res['count'], weights=res['count']))
st.plotly_chart(fig, use_container_width=True)
with st.expander("Show query"):
st.code(query_8, language="sparql")
st.markdown("---")
with st.container():
st.write("#### What incidents have occurred on your favorite ride?")
ride_names = computeQuery(query_0, graph)
option = st.selectbox("Select a Ride", options=ride_names)
res, query = rideAccidentDescription(option, graph)
res_count = res.count()[0]
if (res_count < 3):
st.table(res)
else:
limit = st.slider("Num. of Accidents to Visualize", 1, int(res_count), 2, 1)
st.table(res[:limit])
with st.expander("Show query"):
st.code(query, language="sparql")
st.markdown("---")
with st.container():
st.write("#### What Are the Most Common Categories of Accidents?")
res = computeQuery(query_4, graph)
fig = px.treemap(res, path=[px.Constant("Accident Category"), "category_name"], values="count", hover_data=["category_name","count"])
st.plotly_chart(fig, use_container_width=True)
with st.expander("Show query"):
st.code(query_4, language="sparql")
st.markdown("---")
with st.container():
st.write("#### What are the Most Dangerous Ride Categories?")
res = computeQuery(query_6, graph)
fig = px.pie(res, names="amus_cat_name", values="count", hole=.4)
st.plotly_chart(fig, use_container_width=True)
with st.expander("Show query"):
st.code(query_6, language="sparql")
st.markdown("---")
with st.container():
st.write("#### What are the Most Dangerous Ride Types?")
res = computeQuery(query_3, graph)
fig = px.bar(res, x="type_name", y="count", labels={"type_name":"Ride Type", "count":"Num. of Accidents"}, text_auto=True)
fig.update_xaxes(tickangle=45)
st.plotly_chart(fig, use_container_width=True)
with st.expander("Show query"):
st.code(query_3, language="sparql")
st.markdown("---")
with st.container():
st.write("#### How many people are generally involved in an accident?")
res = computeQuery(query_1, graph)
fig = px.bar(res, x="num_inj", y="count", labels={"num_inj":"Injured People", "count":"Num. of Accidents"}, text_auto=True)
fig.update_xaxes(type="category")
st.plotly_chart(fig, use_container_width=True)
with st.expander("Show query"):
st.code(query_1, language="sparql")
st.markdown("---")
return None
# ANALYTICAL QUERIES DEFINITION
# get the names of all the rides
query_0 = """
PREFIX ride:<http://example.org/ride#>
SELECT DISTINCT ?name
WHERE {
?ride ride:name ?name .
}
"""
# num of accidents per injured people
query_1 = """
PREFIX r:<http://example.org/ride#>
PREFIX a:<http://example.org/accident#>
SELECT ?num_inj (COUNT(?num_inj) AS ?count)
WHERE {
?acc a:num_injured ?num_inj .
}
GROUP BY ?num_inj
ORDER BY (?num_inj)
"""
# manufacturers of the rides subjected to most accidents
query_2 = """
PREFIX acc: <http://example.org/accident#>
PREFIX ride: <http://example.org/ride#>
SELECT ?ride_manuf (COUNT(?ride_manuf) AS ?count)
WHERE {
?instance acc:ref-ride_id ?ride_id .
?ride_id ride:manufacturer ?ride_manuf
}
GROUP BY ?ride_manuf
ORDER BY DESC(?count)
"""
# Top n types of rides most subjected to accidents
query_3 = """
PREFIX ride_type: <http://example.org/ride_type#>
PREFIX acc: <http://example.org/accident#>
PREFIX ride: <http://example.org/ride#>
SELECT ?type_name (COUNT(?type_name) AS ?count)
WHERE {
?instance acc:ref-ride_id ?ride_id .
?ride_id ride:ref-ride_type_id ?type_id .
?type_id ride_type:type ?type_name .
}
GROUP BY ?type_name
ORDER BY DESC(?count)
LIMIT 7
"""
# Top 6 categories of rides most subjected to accidents
query_6 = """
PREFIX amusement_cat: <http://example.org/amusement_category#>
PREFIX ride_type: <http://example.org/ride_type#>
PREFIX acc: <http://example.org/accident#>
PREFIX ride: <http://example.org/ride#>
SELECT ?amus_cat_name (COUNT(?amus_cat_name) AS ?count)
WHERE {
?instance acc:ref-ride_id ?ride_id .
?ride_id ride:ref-ride_type_id ?type_id .
?type_id ride_type:ref-amusement_category_id ?amus_cat_id .
?amus_cat_id amusement_cat:amusement_category ?amus_cat_name .
}
GROUP BY ?amus_cat_name
ORDER BY DESC(?count)
LIMIT 6
"""
# most common categories of accidents
query_4 = """
PREFIX acc_cat: <http://example.org/accident_category#>
PREFIX acc: <http://example.org/accident#>
SELECT ?category_name (COUNT(?category_name) AS ?count)
WHERE {
?instance acc:ref-accident_category_id ?category_id .
?category_id acc_cat:accident_category ?category_name .
}
GROUP BY ?category_name
ORDER BY DESC(?count)
"""
# months with the ngher num of accidents
query_5 = """
PREFIX acc: <http://example.org/accident#>
SELECT ?mon (COUNT(?mon) AS ?count)
WHERE {
?instance acc:date ?date .
}
GROUP BY (month(?date) AS ?mon)
ORDER BY (?mon)
"""
# cities with the higher num of accidents
query_8 = """
PREFIX location: <http://example.org/location#>
PREFIX acc: <http://example.org/accident#>
SELECT ?city (COUNT(?city) AS ?count) ?state
WHERE {
?instance acc:ref-location_id ?location_id .
?location_id location:city ?city ;
location:state ?state
}
GROUP BY ?city
ORDER BY DESC(?count)
"""
# TITLE
st.header("Theme Park Ride Accidents")
st.markdown("""There are **thousands of amusement parks** around the world that welcome **millions of visitors** each year.
Children, families, and teenagers are ready to spend days of adrenaline and fun.
Unfortunately, **accidents sometimes occur**. This raises some questions: **Are amusement parks safe? Which rides are the most accident-prone? What accidents happen most often? At what time of year are accidents most common?**
Let's try to find out in this **RDF data exploration** using **SPARQL** and **Plotly**.""")
st.markdown("---")
display()
# WRITE & RUN YOUR OWN QUERY
st.write("#### Write & Run your Custom Query")
pers_query = st.text_area('', """
PREFIX ride:<http://example.org/ride#>
SELECT ?name
WHERE {
?ride ride:manufacturer "Vekoma" ;
ride:name ?name
}
""", height=200)
with st.container():
try:
res = computeQuery(pers_query, graph)
st.table(res)
except:
st.error("Ooops! Check you query syntax...")
st.markdown("---")
# SIDEBAR
with st.sidebar:
st.write("""
This App proposes some visualization about theme park ride accidents.
The original dataset comes from "Saferparks", an organization that reports and collects data about theme park ride accidents in the US.
The original dataset covers years from 2010 to 2017 and comes in CSV or Excel format. I used python to split the dataset and convert it into the
Third Normal Form (3NF) of Database.
I uploaded the data into a PostgreSQL database and I used the Ontop tool to get the final RDF dataset.
Queries are expressed in SPARQL, and charts are generated with Plotly Express.
""")
st.markdown("---")
st.markdown("## Dataset Resources:")
st.markdown("""
Saferparks Original Dataset: https://ridesdatabase.org/saferparks/data/
Saferparks Dataset Description: https://ridesdatabase.org/wp-content/uploads/2020/02/Saferparks-data-description.pdf
""")