Getting coordinates from multiple providers

geocode-vs.py (11 KB)
  1. import requests
  2. import json
  3. import xml.etree.ElementTree as ET
  4. from numpy import array
  5. from sys import argv
  6. script, city_file = argv
  7. # All functions are pretty much the same; therefore comments (mostly) on the first
  8. def geocode_nominatim():
  9. #Load settings
  10. print "Nominatim geocoding ..."
  11. count_success = 0
  12. count_fail = 0
  13. api_data = json.load(open("API.json", "r"))
  14. return_result = {}
  15. with open(city_file, "r") as fp:
  16. for city in fp:
  17. try:
  18. api_data["Nominatim"]["payload"]["q"] = city # Set query to current city
  19. # send request with payload
  20. r = requests.get(api_data["Nominatim"]["service"]["url"], params=api_data["Nominatim"]["payload"])
  21. placesjson = r.json()
  22. ping = r.elapsed.microseconds / 1000 # get milliseconds
  23. result = placesjson[0] # get the first and (supposedly) best result
  24. return_result.update({city.strip("\n").replace(",",""): {"lat": result['lat'], "lon": result['lon'], "ping_ms": ping}})
  25. count_success += 1
  26. except KeyboardInterrupt:
  27. exit()
  28. except:
  29. count_fail += 1
  30. return_result.update({city.strip("\n").replace(",",""): {"lat": "NA", "lon": "NA", "ping_ms": ping}})
  31. print "%s out of %s cities were geocoded successfully (%d%%)." % (count_success,
  32. count_success + count_fail,
  33. float(count_success) / float(count_success + count_fail) * 100)
  34. return return_result
  35. def geocode_google():
  36. print "Google geocoding ..."
  37. count_success = 0
  38. count_fail = 0
  39. api_data = json.load(open("API.json", "r"))
  40. return_result = {}
  41. with open(city_file, "r") as fp:
  42. for city in fp:
  43. try:
  44. api_data["Google"]["payload"]["address"] = city
  45. r = requests.get(api_data["Google"]["service"]["url"], params=api_data["Google"]["payload"])
  46. placesjson = r.json()
  47. ping = r.elapsed.microseconds / 1000
  48. result = placesjson["results"][0]["geometry"]["location"]
  49. return_result.update({city.strip("\n").replace(",",""): {"lat": result['lat'], "lon": result['lng'], "ping_ms": ping}})
  50. count_success += 1
  51. except KeyboardInterrupt:
  52. exit()
  53. except:
  54. count_fail += 1
  55. return_result.update({city.strip("\n").replace(",",""): {"lat": "NA", "lon": "NA", "ping_ms": ping}})
  56. print "%s out of %s cities were geocoded successfully (%d%%)." % (count_success,
  57. count_success + count_fail,
  58. float(count_success) / float(count_success + count_fail) * 100)
  59. return return_result
  60. def geocode_here():
  61. print "HERE geocoding ..."
  62. count_success = 0
  63. count_fail = 0
  64. api_data = json.load(open("API.json", "r"))
  65. return_result = {}
  66. with open(city_file, "r") as fp:
  67. for city in fp:
  68. try:
  69. api_data["Here"]["payload"]["searchtext"] = city
  70. r = requests.get(api_data["Here"]["service"]["url"], params=api_data["Here"]["payload"])
  71. ping = r.elapsed.microseconds / 1000
  72. root = ET.fromstring(r.content)
  73. # [1] = first result; long version for readability;
  74. # short version: root[0][1][1][3][2][0].text
  75. lat = root.findtext("./Response/View/Result[1]/Location/DisplayPosition/Latitude")
  76. lon = root.findtext("./Response/View/Result[1]/Location/DisplayPosition/Longitude")
  77. ping = r.elapsed.microseconds / 1000
  78. if lat == None or lon == None:
  79. count_fail += 1
  80. return_result.update({city.strip("\n").replace(",",""): {"lat": "NA", "lon": "NA", "ping_ms": ping}})
  81. else:
  82. count_success += 1
  83. return_result.update({city.strip("\n").replace(",",""): {"lat": lat, "lon": lon, "ping_ms": ping}})
  84. except KeyboardInterrupt:
  85. exit()
  86. print "%s out of %s cities were geocoded successfully (%d%%)." % (count_success,
  87. count_success + count_fail,
  88. float(count_success) / float(count_success + count_fail) * 100)
  89. return return_result
  90. def geocode_mapquest():
  91. print "MapQuest geocoding ..."
  92. count_success = 0
  93. count_fail = 0
  94. api_data = json.load(open("API.json", "r"))
  95. return_result = {}
  96. with open(city_file, "r") as fp:
  97. for city in fp:
  98. try:
  99. api_data["MapQuest"]["payload"]["location"] = city
  100. r = requests.get(api_data["MapQuest"]["service"]["url"], params=api_data["MapQuest"]["payload"])
  101. placesjson = r.json()
  102. ping = r.elapsed.microseconds / 1000
  103. result = placesjson['results'][0]['locations'][0]['latLng']
  104. count_success += 1
  105. return_result.update({city.strip("\n").replace(",",""): {"lat": result['lat'], "lon": result['lng'], "ping_ms": ping}})
  106. except KeyboardInterrupt:
  107. exit()
  108. except:
  109. count_fail += 1
  110. return_result.update({city.strip("\n").replace(",",""): {"lat": "NA", "lon": "NA", "ping_ms": ping}})
  111. print "%s out of %s cities were geocoded successfully (%d%%)." % (count_success,
  112. count_success + count_fail,
  113. float(count_success) / float(count_success + count_fail) * 100)
  114. return return_result
  115. def geocode_opencage():
  116. print "OpenCage geocoding ..."
  117. count_success = 0
  118. count_fail = 0
  119. api_data = json.load(open("API.json", "r"))
  120. return_result = {}
  121. with open(city_file, "r") as fp:
  122. for city in fp:
  123. try:
  124. api_data["OpenCage"]["payload"]["q"] = city.replace(",","")
  125. r = requests.get(api_data["OpenCage"]["service"]["url"], params=api_data["OpenCage"]["payload"])
  126. placesjson = r.json()
  127. ping = r.elapsed.microseconds / 1000
  128. result = placesjson['results'][0]['geometry']
  129. count_success += 1
  130. return_result.update({city.strip("\n").replace(",",""): {"lat": result['lat'], "lon": result['lng'], "ping_ms": ping}})
  131. except KeyboardInterrupt:
  132. exit()
  133. except:
  134. count_fail += 1
  135. return_result.update({city.strip("\n").replace(",",""): {"lat": "NA", "lon": "NA", "ping_ms": ping}})
  136. print "%s out of %s cities were geocoded successfully (%d%%)." % (count_success,
  137. count_success + count_fail,
  138. float(count_success) / float(count_success + count_fail) * 100)
  139. return return_result
  140. def write_results():
  141. with open("cities.csv", "w+") as f:
  142. # Writing header
  143. f.write("City, "
  144. + "Nominatim_Lat, Nominatim_Lon, Nominatim_ping_ms, "
  145. + "Google_Lat, Google_Lon, Google_ping_ms, "
  146. + "MapQuest_Lat, MapQuest_Lon, MapQuest_ping_ms, "
  147. + "Here_Lat, Here_Lon, Here_ping_ms, "
  148. + "OpenCage_Lat, OpenCage_Lon, OpenCage_ping_ms"
  149. + "\n")
  150. # Write results
  151. for a, b in nom.iteritems():
  152. with open("cities.csv", "a+") as f:
  153. f.write(a + ", " + str(b["lat"]) + ", "
  154. + str(b["lon"]) + ", "
  155. + str(b["ping_ms"]) + ", "
  156. + str(ggl[a]["lat"]) + ", "
  157. + str(ggl[a]["lon"]) + ", "
  158. + str(ggl[a]["ping_ms"]) + ", "
  159. + str(mq[a]["lat"]) + ", "
  160. + str(mq[a]["lon"]) + ", "
  161. + str(mq[a]["ping_ms"]) + ", "
  162. + str(here[a]["lat"]) + ", "
  163. + str(here[a]["lon"]) + ", "
  164. + str(here[a]["ping_ms"]) + ", "
  165. + str(oc[a]["lat"]) + ", "
  166. + str(oc[a]["lon"]) + ", "
  167. + str(oc[a]["ping_ms"])
  168. + "\n")
  169. print "CSV output written to: cities.csv"
  170. def calc_stats():
  171. nom_outlier = 0
  172. ggl_outlier = 0
  173. mq_outlier = 0
  174. here_outlier = 0
  175. oc_outlier = 0
  176. outlier = []
  177. nom_ping = 0
  178. ggl_ping = 0
  179. mq_ping = 0
  180. here_ping = 0
  181. oc_ping = 0
  182. nom_fail = 0
  183. nom_success = 0
  184. ggl_fail = 0
  185. ggl_success = 0
  186. mq_fail = 0
  187. mq_success = 0
  188. here_fail = 0
  189. here_success = 0
  190. oc_fail = 0
  191. oc_success = 0
  192. iter_count = 1
  193. fail_count = 0
  194. for a, b in nom.iteritems():
  195. # Get response time for queries
  196. nom_ping += b["ping_ms"]
  197. ggl_ping += ggl[a]["ping_ms"]
  198. mq_ping += mq[a]["ping_ms"]
  199. here_ping += here[a]["ping_ms"]
  200. oc_ping += oc[a]["ping_ms"]
  201. iter_count += 1
  202. # Get success rate of geocoding per provider (w/o passing the value from the
  203. # function above...
  204. if b["lat"] == "NA":
  205. nom_fail += 1
  206. else:
  207. nom_success += 1
  208. if ggl[a]["lat"] == "NA":
  209. ggl_fail += 1
  210. else:
  211. ggl_success += 1
  212. if mq[a]["lat"] == "NA":
  213. mq_fail += 1
  214. else:
  215. mq_success += 1
  216. if here[a]["lat"] == "NA":
  217. here_fail += 1
  218. else:
  219. here_success += 1
  220. if oc[a]["lat"] == "NA":
  221. oc_fail += 1
  222. else:
  223. oc_success += 1
  224. # Try to calculate outliers
  225. try:
  226. coords_x = array((float(b["lat"]), float(ggl[a]["lat"]), float(mq[a]["lat"]),
  227. float(here[a]["lat"]), float(oc[a]["lat"])))
  228. coords_y = array((float(b["lon"]), float(ggl[a]["lon"]), float(mq[a]["lon"]),
  229. float(here[a]["lon"]), float(oc[a]["lon"])))
  230. # Calculate outliers, which is done by comparing a coordinate with the standard
  231. # deviation of the 5 providers. If the difference is higher than the std then
  232. # the point should be counted as outlier. Although this doesn't neccessarily mean
  233. # anything it may create hints to investige further. It could be a sign for a very
  234. # high or very poor quality of geocoding or some other issues (e.g. using the
  235. # first result of the response set).
  236. if (abs(coords_x.mean() - float(b["lat"])) > coords_x.std()
  237. and abs(coords_y.mean() - float(b["lat"])) > coords_y.std()):
  238. nom_outlier += 1
  239. outlier.append(str(a) + " (Nominatim)")
  240. elif (abs(coords_x.mean() - float(ggl[a]["lat"])) > coords_x.std()
  241. and abs(coords_y.mean() - float(ggl[a]["lat"])) > coords_y.std()):
  242. ggl_outlier += 1
  243. outlier.append(str(a) + " (Google)")
  244. elif (abs(coords_x.mean() - float(mq[a]["lat"])) > coords_x.std()
  245. and abs(coords_y.mean() - float(mq[a]["lat"])) > coords_y.std()):
  246. mq_outlier += 1
  247. outlier.append(str(a) + " (MapQuest)")
  248. elif (abs(coords_x.mean() - float(here[a]["lat"])) > coords_x.std()
  249. and abs(coords_y.mean() - float(here[a]["lat"])) > coords_y.std()):
  250. here_outlier += 1
  251. outlier.append(str(a) + " (HERE)")
  252. elif (abs(coords_x.mean() - float(oc[a]["lat"])) > coords_x.std()
  253. and abs(coords_y.mean() - float(oc[a]["lat"])) > coords_y.std()):
  254. oc_outlier += 1
  255. outlier.append(str(a) + " (OpenCage)")
  256. except ValueError:
  257. # Coordinates of one or more providers are missing ("NA"), therefore do not
  258. # calculate. This should make sure that at least some statistical common sense
  259. # is obeyed. But definitely open for discussion, whether it is a good idea.
  260. fail_count += 1
  261. print """
  262. -----------------------------------------------------------------------
  263. | | success rate / no. of outliers / ping (ms)
  264. |-----------|----------------------------------------------------------
  265. | Nominatim | %s%% / %s / %s
  266. | Google | %s%% / %s / %s
  267. | MapQuest | %s%% / %s / %s
  268. | HERE | %s%% / %s / %s
  269. | OpenCage | %s%% / %s / %s
  270. |----------------------------------------------------------------------
  271. | Processed %s datapoints, %s were used for outlier calculation (%s%%).
  272. -----------------------------------------------------------------------
  273. """ % (format(float(nom_success) / float(nom_fail + nom_success) * 100, '.2f'), nom_outlier,
  274. nom_ping / iter_count,
  275. format(float(ggl_success) / float(ggl_fail + ggl_success) * 100, '.2f'),
  276. ggl_outlier, ggl_ping / iter_count,
  277. format(float(mq_success) / float(mq_fail + mq_success) * 100, '.2f'),
  278. mq_outlier, mq_ping / iter_count,
  279. format(float(here_success) / float(here_fail + here_success) * 100, '.2f'),
  280. here_outlier, here_ping / iter_count,
  281. format(float(oc_success) / float(oc_fail + oc_success) * 100, '.2f'),
  282. oc_outlier, oc_ping / iter_count,
  283. iter_count, iter_count - fail_count, float(iter_count - fail_count) / float(iter_count) * 100)
  284. # Write outliers to file for further investigation
  285. with open("outliers.txt", "w+") as o:
  286. for item in outlier:
  287. o.write(item + "\n")
  288. print "Outliers written to: outliers.txt"
  289. if __name__ == "__main__":
  290. nom = geocode_nominatim()
  291. ggl = geocode_google()
  292. mq = geocode_mapquest()
  293. here = geocode_here()
  294. oc = geocode_opencage()
  295. calc_stats()
  296. write_results()