library("spflow")
library("sf")
library("spdep")
This article illustrates the use of the spflow package for modeling spatial interactions using the example of home-to-work commuting flows. For our example we use information on the 71 municipalities that are located closest to the center of Paris. This data is contained in the package and was originally diffused by the French National Institutes of Statistics and Economic Studies (INSEE), and of Geographic and Forest Information (IGN). (For more information see help(paris_data)
.)
data("paris10km_municipalities")
data("paris10km_commuteflows")
Each municipality is identified by a unique id. Additionally, we have information on the population, the median income and the number of companies.
<- names(paris10km_municipalities) != "AREA"
drop_area plot(paris10km_municipalities[drop_area])
There are three different neighborhood matrices that can be used to describe the connectivity between the municipalities.
<- par(mfrow = c(1, 3), mar = c(0,0,1,0))
old_par
<- suppressWarnings({
mid_points st_point_on_surface(st_geometry(paris10km_municipalities))})
<- list(
paris10km_nb "by_contiguity" = spdep::poly2nb(paris10km_municipalities),
"by_distance" = spdep::dnearneigh(mid_points,d1 = 0, d2 = 5),
"by_knn" = spdep::knn2nb(knearneigh(mid_points,3))
)
plot(st_geometry(paris10km_municipalities))
plot(paris10km_nb$by_contiguity, mid_points, add = T, col = rgb(0,0,0,alpha=0.5))
title("Contiguity")
plot(st_geometry(paris10km_municipalities))
plot(paris10km_nb$by_distance,mid_points, add = T, col = rgb(0,0,0,alpha=0.5))
title("Distance")
plot(st_geometry(paris10km_municipalities))
plot(paris10km_nb$by_knn, mid_points, add = T, col = rgb(0,0,0,alpha=0.5))
title("3 Nearest Neighbors")
par(old_par)
Finally, there is data on the size of the commuting flows and the distance between all pairs of municipalities.
paris10km_commuteflows #> ID_ORIG ID_DEST DISTANCE COMMUTE_FLOW
#> 1 75101 75101 0.0000 3771.235555
#> 2 75101 75102 787.6493 294.768992
#> 3 75101 75103 1734.3136 71.251159
#> 4 75101 75104 1811.4124 99.384682
#> 5 75101 75105 2268.2408 98.889146
#> 6 75101 75106 1513.1158 65.154062
#> 7 75101 75107 1908.6102 96.991438
#> 8 75101 75108 2083.9014 455.666096
#> 9 75101 75109 1610.6894 265.184461
#> 10 75101 75110 2339.9467 74.846270
#> 11 75101 75111 3241.7862 93.954966
#> 12 75101 75112 6939.4271 123.460322
#> 13 75101 75113 4262.6282 130.726524
#> 14 75101 75114 3771.3545 74.777413
#> 15 75101 75115 4061.6599 122.035858
#> 16 75101 75116 5463.6893 126.298089
#> 17 75101 75117 3509.1773 95.305200
#> 18 75101 75118 3437.6852 47.283055
#> 19 75101 75119 4485.0897 46.148335
#> 20 75101 75120 4763.9492 25.819257
#> 21 75101 92004 6908.1954 18.840256
#> 22 75101 92007 7357.6899 12.428163
#> 23 75101 92009 7777.9864 15.764324
#> 24 75101 92012 7708.8244 108.691009
#> 25 75101 92014 9249.7126 8.287077
#> 26 75101 92020 7559.7570 19.252633
#> 27 75101 92024 5113.8197 26.692705
#> 28 75101 92025 9383.4950 19.986097
#> 29 75101 92026 7163.1816 144.794397
#> 30 75101 92032 8880.7653 10.126380
#> 31 75101 92035 8329.2367 5.971389
#> 32 75101 92036 8581.3876 23.492583
#> 33 75101 92040 6856.4454 67.487922
#> 34 75101 92044 5135.8208 105.543217
#> 35 75101 92046 5920.0446 16.164938
#> 36 75101 92049 5443.4834 21.685858
#> 37 75101 92051 5742.1941 98.483601
#> 38 75101 92062 7575.3666 165.913286
#> 39 75101 92073 8595.9356 20.755474
#> 40 75101 92075 5804.3871 5.833138
#> 41 75101 92078 8332.3849 0.000000
#> 42 75101 93001 6617.1492 26.613996
#> 43 75101 93006 6375.8895 7.027788
#> 44 75101 93008 9095.3599 17.442278
#> 45 75101 93013 10545.5676 2.477162
#> 46 75101 93027 9069.7148 3.136324
#> 47 75101 93029 10412.1524 0.000000
#> 48 75101 93039 8352.0973 0.000000
#> 49 75101 93045 6488.0041 4.797271
#> 50 75101 93048 8211.0625 84.570051
#> [ reached 'max' / getOption("max.print") -- omitted 4991 rows ]
The spflow package builds on the idea that flows correspond to pairwise interactions between the nodes of an origin network with the nodes of a destination network.
In our example, the origin and destination networks are the same because every municipality is both an origin and destination of a flow.
To estimate the model efficiently, the spflow package uses moment-based estimation methods, that exploit the relational structure of flow data. This avoids duplication arising from the fact that each municipality is at the origin and destination of many flows.
To describe the nodes of a network the package provides sp_network_nodes-class
that combines attributes of the nodes with the chosen network structure. For our model we choose the contiguity based neighborhood structure.
<-
paris10km_net sp_network_nodes(
network_id = "paris",
node_neighborhood = nb2mat(paris10km_nb$by_contiguity),
node_data = st_drop_geometry(paris10km_municipalities),
node_key_column = "ID_MUN")
paris10km_net#> Spatial network nodes with id: paris
#> --------------------------------------------------
#> Number of nodes: 71
#> Average number of links per node: 5.239
#> Density of the neighborhood matrix: 7.38% (non-zero connections)
#>
#> Data on nodes:
#> ID_MUN POPULATION MED_INCOME NB_COMPANY AREA
#> 1 75101 17100 31842.56 14333 182
#> 2 75102 22390 30024.50 14478 99
#> 3 75103 35991 30988.00 10696 117
#> 4 75104 27769 30514.67 7412 160
#> 5 75105 60179 32950.00 10290 252
#> 6 75106 43224 38447.69 10620 215
#> 7 75107 57092 41949.00 12602 412
#> 8 75108 38749 39774.00 52237 386
#> 9 75109 59474 32771.00 23687 218
#> 10 75110 94474 25154.00 23996 288
#> 11 75111 155006 26253.00 27481 366
#> 12 75112 144925 26729.00 19434 1637
#> 13 75113 182386 23538.18 16202 714
#> 14 75114 141102 27233.00 16066 561
#> 15 75115 238190 30227.33 25897 846
#> 16 75116 167613 38299.00 34510 1641
#> 17 75117 170156 29872.00 32182 564
#> 18 75118 201374 20942.00 23839 604
#> 19 75119 186116 19136.96 17481 675
#> 20 75120 197311 20632.00 23780 597
#> 21 92004 83845 23775.00 6562 482
#> 22 92007 38398 18293.38 1831 418
#> 23 92009 28709 28105.00 1820 193
#> 24 92012 117126 31267.78 14965 615
#> 25 92014 19872 31040.43 1337 186
#> 26 92020 34960 27258.00 1867 293
#> 27 92024 59240 18424.44 4977 308
#> 28 92025 85357 21158.00 4952 779
#> 29 92026 86854 28932.00 8065 417
#> 30 92032 22866 25036.00 1086 253
#> 31 92035 28371 29001.67 2179 178
#> 32 92036 42919 16465.76 2570 1164
#> 33 92040 65322 30259.20 5339 425
#> 34 92044 64654 30504.00 7457 242
#> 35 92046 30420 23041.00 2261 207
#> 36 92049 48909 26831.71 3647 207
#> 37 92051 62021 43350.00 10208 372
#> 38 92062 44514 26401.33 5303 318
#> 39 92073 47263 28212.00 3975 380
#> 40 92075 27367 27710.00 1599 156
#> [ reached 'max' / getOption("max.print") -- omitted 31 rows ]
The sp_network_pair-class
contains all information on the pairs of nodes belonging to the origin and destination networks.
<- sp_network_pair(
paris10km_net_pairs orig_net_id = "paris",
dest_net_id = "paris",
pair_data = paris10km_commuteflows,
orig_key_column = "ID_ORIG",
dest_key_column = "ID_DEST")
paris10km_net_pairs#> Spatial network pair with id: paris_paris
#> --------------------------------------------------
#> Origin network id: paris (with 71 nodes)
#> Destination network id: paris (with 71 nodes)
#> Number of pairs: 5041
#> Completeness of pairs: 100.00% (5041/5041)
#>
#> Data on node-pairs:
#> ID_ORIG ID_DEST DISTANCE COMMUTE_FLOW
#> 1 75101 75101 0.0000 3771.235555
#> 2 75101 75102 787.6493 294.768992
#> 3 75101 75103 1734.3136 71.251159
#> 4 75101 75104 1811.4124 99.384682
#> 5 75101 75105 2268.2408 98.889146
#> 6 75101 75106 1513.1158 65.154062
#> 7 75101 75107 1908.6102 96.991438
#> 8 75101 75108 2083.9014 455.666096
#> 9 75101 75109 1610.6894 265.184461
#> 10 75101 75110 2339.9467 74.846270
#> 11 75101 75111 3241.7862 93.954966
#> 12 75101 75112 6939.4271 123.460322
#> 13 75101 75113 4262.6282 130.726524
#> 14 75101 75114 3771.3545 74.777413
#> 15 75101 75115 4061.6599 122.035858
#> 16 75101 75116 5463.6893 126.298089
#> 17 75101 75117 3509.1773 95.305200
#> 18 75101 75118 3437.6852 47.283055
#> 19 75101 75119 4485.0897 46.148335
#> 20 75101 75120 4763.9492 25.819257
#> 21 75101 92004 6908.1954 18.840256
#> 22 75101 92007 7357.6899 12.428163
#> 23 75101 92009 7777.9864 15.764324
#> 24 75101 92012 7708.8244 108.691009
#> 25 75101 92014 9249.7126 8.287077
#> 26 75101 92020 7559.7570 19.252633
#> 27 75101 92024 5113.8197 26.692705
#> 28 75101 92025 9383.4950 19.986097
#> 29 75101 92026 7163.1816 144.794397
#> 30 75101 92032 8880.7653 10.126380
#> 31 75101 92035 8329.2367 5.971389
#> 32 75101 92036 8581.3876 23.492583
#> 33 75101 92040 6856.4454 67.487922
#> 34 75101 92044 5135.8208 105.543217
#> 35 75101 92046 5920.0446 16.164938
#> 36 75101 92049 5443.4834 21.685858
#> 37 75101 92051 5742.1941 98.483601
#> 38 75101 92062 7575.3666 165.913286
#> 39 75101 92073 8595.9356 20.755474
#> 40 75101 92075 5804.3871 5.833138
#> 41 75101 92078 8332.3849 0.000000
#> 42 75101 93001 6617.1492 26.613996
#> 43 75101 93006 6375.8895 7.027788
#> 44 75101 93008 9095.3599 17.442278
#> 45 75101 93013 10545.5676 2.477162
#> 46 75101 93027 9069.7148 3.136324
#> 47 75101 93029 10412.1524 0.000000
#> 48 75101 93039 8352.0973 0.000000
#> 49 75101 93045 6488.0041 4.797271
#> 50 75101 93048 8211.0625 84.570051
#> [ reached 'max' / getOption("max.print") -- omitted 4991 rows ]
The sp_multi_network-class
combines information on the nodes and the node-pairs and also ensures that both data sources are consistent. For example, if some of the origins in the sp_network_pair-class
are not identified with the nodes in the sp_network_nodes-class
an error will be raised.
<- sp_multi_network(paris10km_net,paris10km_net_pairs)
paris10km_multi_net
paris10km_multi_net#> Collection of spatial network nodes and pairs
#> --------------------------------------------------
#> Contains 1 sets of spatial network nodes.
#> With ids: paris
#> Contains 1 sets of spatial network pairs
#> With ids: paris_paris
#>
#> Availability of origin-destination pair information:
#> ID_NET_PAIR NPAIRS COMPLETENESS ID_ORIG_NET ORIG_NNODES ID_DEST_NET
#> 1 paris_paris 5041 100.00% paris 71 paris
#> DEST_NNODES
#> 1 71
The core function of the package is spflow()
, which provides an interface to four different estimators of the spatial econometric interaction model.
Estimation with default settings requires two arguments: an sp_multi_network-class
and a flow_formula
. The flow_formula
specifies the model we want to estimate. In this example, the dependent variable is a transformation of commuting flows and we use the dot shortcut to indicate that all available variables should be included in the model. Using the defaults leads to the most comprehensive spatial interaction model, which includes spatial lags of the dependent variable, the exogenous variables and additional attributes for intra-regional observations.
<- spflow(
results_default flow_formula = log(1 + COMMUTE_FLOW) ~ . + G_(log( 1 + DISTANCE)),
sp_multi_network = paris10km_multi_net)
results_default#> --------------------------------------------------
#> Spatial interaction model estimated by: MLE
#> Autocorrelation structure: model_9 (SDM)
#> Observations: 5041
#>
#> --------------------------------------------------
#> Coefficients:
#> est sd t.stat p.value
#> rho_d 0.44 0.02 28.06 0.01
#> rho_o 0.74 0.01 80.74 0.00
#> rho_w -0.35 0.02 -16.86 0.02
#> (Intercept) 1.27 0.23 5.50 0.06
#> (Intra) 3.11 0.47 6.62 0.05
#> DEST_AREA 0.00 0.00 8.04 0.04
#> DEST_MED_INCOME 0.00 0.00 -3.88 0.08
#> DEST_NB_COMPANY 0.00 0.00 3.43 0.09
#> DEST_POPULATION 0.00 0.00 4.67 0.07
#> DEST_AREA.lag1 0.00 0.00 -4.66 0.07
#> DEST_MED_INCOME.lag1 0.00 0.00 7.08 0.04
#> DEST_NB_COMPANY.lag1 0.00 0.00 2.95 0.10
#> DEST_POPULATION.lag1 0.00 0.00 -0.78 0.29
#> ORIG_AREA 0.00 0.00 6.23 0.05
#> ORIG_MED_INCOME 0.00 0.00 -0.24 0.42
#> ORIG_NB_COMPANY 0.00 0.00 -5.27 0.06
#> ORIG_POPULATION 0.00 0.00 22.58 0.01
#> ORIG_AREA.lag1 0.00 0.00 -3.94 0.08
#> ORIG_MED_INCOME.lag1 0.00 0.00 0.08 0.48
#> ORIG_NB_COMPANY.lag1 0.00 0.00 0.39 0.38
#> ORIG_POPULATION.lag1 0.00 0.00 -4.95 0.06
#> INTRA_AREA 0.00 0.00 -2.05 0.14
#> INTRA_MED_INCOME 0.00 0.00 0.68 0.31
#> INTRA_NB_COMPANY 0.00 0.00 -0.31 0.40
#> INTRA_POPULATION 0.00 0.00 -0.98 0.25
#> INTRA_AREA.lag1 0.00 0.00 0.90 0.27
#> INTRA_MED_INCOME.lag1 0.00 0.00 -2.36 0.13
#> INTRA_NB_COMPANY.lag1 0.00 0.00 2.29 0.13
#> INTRA_POPULATION.lag1 0.00 0.00 -1.33 0.21
#> log(1 + DISTANCE) -0.14 0.02 -6.01 0.05
#>
#> --------------------------------------------------
#> R2_corr: 0.9110023
We can adjust how the exogenous variables are to be used by wrapping them into the D_()
, O_()
, I_()
and G_()
functions. The variables in G_()
are used as OD pair features and those in D_()
, O_()
and I_()
are used as destination, origin and intra-regional features. We can take advantage of the formula interface to specify transformations and expand factor variables to dummies.
<- function(x) {
clog <- log(x)
log_x - mean(log_x)
log_x
}
<-
flow_formula log(COMMUTE_FLOW + 1) ~
D_(log(NB_COMPANY) + clog(MED_INCOME)) +
O_(log(POPULATION) + clog(MED_INCOME)) +
I_(log(POPULATION)) +
G_(log(DISTANCE + 1))
<- spflow(
results_mle
flow_formula,
paris10km_multi_net)
results_mle#> --------------------------------------------------
#> Spatial interaction model estimated by: MLE
#> Autocorrelation structure: model_9 (SDM)
#> Observations: 5041
#>
#> --------------------------------------------------
#> Coefficients:
#> est sd t.stat p.value
#> rho_d 0.21 0.02 10.98 0.03
#> rho_o 0.67 0.01 60.66 0.01
#> rho_w -0.02 0.02 -0.85 0.28
#> (Intercept) -0.80 0.29 -2.73 0.11
#> (Intra) 3.36 1.79 1.88 0.16
#> DEST_clog(MED_INCOME) -0.42 0.05 -8.16 0.04
#> DEST_log(NB_COMPANY) 0.35 0.01 23.45 0.01
#> DEST_clog(MED_INCOME).lag1 0.66 0.07 9.07 0.03
#> DEST_log(NB_COMPANY).lag1 -0.24 0.02 -11.44 0.03
#> ORIG_clog(MED_INCOME) -0.09 0.05 -1.80 0.16
#> ORIG_log(POPULATION) 0.76 0.02 35.79 0.01
#> ORIG_clog(MED_INCOME).lag1 -0.03 0.07 -0.47 0.36
#> ORIG_log(POPULATION).lag1 -0.60 0.03 -19.24 0.02
#> INTRA_log(POPULATION) -0.50 0.09 -5.83 0.05
#> INTRA_log(POPULATION).lag1 0.34 0.16 2.08 0.14
#> log(DISTANCE + 1) -0.14 0.02 -6.80 0.05
#>
#> --------------------------------------------------
#> R2_corr: 0.9192029
spflow_control
More fine-grained adjustments are possible via the spflow_control
argument. Here we change the estimation method and the way we want to model the spatial autoregression in the flows. To use spatial lags only for certain variables, we need to specify them as a second formula.
<- ~
sdm_formula O_(log(POPULATION) + clog(MED_INCOME)) +
D_(log(NB_COMPANY) + clog(MED_INCOME))
<- spflow_control(
cntrl estimation_method = "mcmc",
sdm_variables = sdm_formula,
model = "model_7") # restricts \rho_w = 0
<- spflow(
results_mcmc
flow_formula,
paris10km_multi_net,flow_control = cntrl)
results_mcmc#> --------------------------------------------------
#> Spatial interaction model estimated by: MCMC
#> Autocorrelation structure: model_7 (SDM)
#> Observations: 5041
#>
#> --------------------------------------------------
#> Coefficients:
#> est quant_025 quant_975 sd t.stat p.value
#> rho_d 0.20 0.17 0.23 0.01 13.38 0.02
#> rho_o 0.66 0.64 0.68 0.02 44.17 0.01
#> (Intercept) -0.83 -1.40 -0.27 0.29 -2.92 0.10
#> (Intra) 6.61 4.82 8.31 0.90 7.33 0.04
#> DEST_clog(MED_INCOME) -0.43 -0.53 -0.32 0.05 -7.96 0.04
#> DEST_log(NB_COMPANY) 0.35 0.32 0.38 0.02 18.67 0.02
#> DEST_clog(MED_INCOME).lag1 0.66 0.51 0.80 0.08 8.50 0.04
#> DEST_log(NB_COMPANY).lag1 -0.24 -0.28 -0.21 0.02 -12.12 0.03
#> ORIG_clog(MED_INCOME) -0.09 -0.19 0.01 0.05 -1.82 0.16
#> ORIG_log(POPULATION) 0.78 0.74 0.81 0.02 38.05 0.01
#> ORIG_clog(MED_INCOME).lag1 -0.03 -0.16 0.10 0.07 -0.43 0.37
#> ORIG_log(POPULATION).lag1 -0.61 -0.66 -0.56 0.03 -23.02 0.01
#> INTRA_log(POPULATION) -0.45 -0.61 -0.29 0.08 -5.52 0.06
#> log(DISTANCE + 1) -0.14 -0.18 -0.10 0.02 -6.19 0.05
#>
#> --------------------------------------------------
#> R2_corr: 0.9189524