Python script for comparing "zonal statistics" implementations/algorithms

zonalStatBenchmark.py 11KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287
  1. #!/usr/bin/python
  2. # -*- coding: utf-8 -*-
  3. import subprocess
  4. import sys
  5. import os
  6. from collections import defaultdict
  7. import re
  8. import time
  9. class zonalBenchmark:
  10. '''
  11. The script will provide a straight forward way to compare
  12. different zonal statistic implementations.
  13. '''
  14. def __init__(self):
  15. '''
  16. Current state of implementation:
  17. 1. oft-stat
  18. 2. pktools
  19. 3. GRASS
  20. 4. SAGA
  21. ( ^ those are implemented)
  22. 5. QGIS
  23. 6. rasterstats
  24. 7. starspan
  25. 8. R
  26. 9. R - multicore
  27. 10. PostGIS
  28. 11. GeoTools
  29. ( ^ those not yet)
  30. '''
  31. def checkInstallStatus():
  32. '''
  33. Check if the selected provider(s) are available on the
  34. (linux) machine.
  35. '''
  36. def getArguments(self, args):
  37. '''
  38. Get the links to the data which has been provided by the CLI
  39. and hand it to the execution. Checks also if the data exists.
  40. '''
  41. # show help for no arguments
  42. if len(args[0]) < 2:
  43. zonalBenchmark().showUsage()
  44. sys.exit()
  45. try:
  46. if args[0][1] == '-h' or args[0][1] == '--help':
  47. zonalBenchmark().showUsage()
  48. sys.exit()
  49. except IndexError:
  50. zonalBenchmark().showUsage()
  51. sys.exit()
  52. # get arguments for tools, split them, convert to int and sort
  53. try:
  54. toolsArgs = args[0][1].split('-')
  55. tools = sorted(map(int, toolsArgs), key=int)
  56. except (TypeError, ValueError) as e:
  57. zonalBenchmark(args).showUsage()
  58. sys.exit()
  59. # dict of implemented tools
  60. availableTools = {1: 'Open Foris Tools',
  61. 2: 'pktools',
  62. 3: 'GRASS',
  63. 4: 'SAGA'}
  64. selectedTools = {}
  65. # create dict for selected tools
  66. for item in tools:
  67. if item in availableTools:
  68. selectedTools.update({item: availableTools[item]})
  69. # get input file
  70. workingDir = os.path.dirname(os.path.realpath(__file__))
  71. try:
  72. inputArg = args[0][2]
  73. inputFile = os.path.join(workingDir, inputArg)
  74. if not os.path.isfile(inputFile):
  75. print 'ERROR: Input file does not exist\You can run \'{0} --help\' for usage of the program.\n'.format(str(args[0][0]).lstrip('./'))
  76. sys.exit()
  77. except (IndexError, ValueError) as e:
  78. zonalBenchmark().showUsage()
  79. # get mask file
  80. try:
  81. maskArg = args[0][3]
  82. maskFile = os.path.join(workingDir, maskArg)
  83. if not os.path.isfile(maskFile):
  84. print 'ERROR: Mask file does not exist\nYou can run \'{0} --help\' for usage of the program.\n'.format(str(args[0][0]).lstrip('./'))
  85. sys.exit()
  86. except (IndexError, ValueError) as e:
  87. zonalBenchmark().showUsage()
  88. # get number of repetitions
  89. try:
  90. repArg = args[0][4]
  91. except (IndexError, ValueError) as e:
  92. zonalBenchmark().showUsage()
  93. return selectedTools, inputFile, maskFile, repArg
  94. def setCommands(self, selectedTools, inputFile, maskFile):
  95. '''
  96. Constructs the actual command(s) for the selected providers.
  97. '''
  98. # create a raster from the shape
  99. maskFileName = os.path.basename(maskFile)
  100. maskTifName = os.path.dirname(maskFile) + '/' + maskFileName[:-4] + ".tif"
  101. maskShpName = os.path.dirname(maskFile) + '/' + maskFileName[:-4] + ".shp"
  102. f = open('bash_rasterize.sh', 'w')
  103. f.write('''resX=$(pkinfo -i "{0}" -ns | cut -d ' ' -f 2);echo $resX; resY=$(pkinfo -i "{0}" -nl | cut -d ' ' -f 2);echo $resY;xmin=$(pkinfo -i "{0}" -ulx | cut -d '=' -f 2);echo $xmin;xmax=$(pkinfo -i "{0}" -lrx | cut -d '=' -f 2);echo $xmax;ymax=$(pkinfo -i "{0}" -uly | cut -d '=' -f 2);echo $ymax;ymin=$(pkinfo -i "{0}" -lry | cut -d '=' -f 2);echo $ymin;gdal_rasterize -ts $resX $resY -te $xmin $ymin $xmax $ymax -a_nodata 0 -ot Byte -burn 1 -l "{2}" "{1}" "{3}"'''.format(inputFile, maskShpName, maskFileName[:-4], maskTifName))
  104. subprocess.call('chmod +x ./bash_rasterize.sh', shell=True, stderr=subprocess.STDOUT)
  105. selectedCommands = {}
  106. if 1 in selectedTools:
  107. maskFileTif = maskFile.replace('shp', 'tif')
  108. oftCMD = 'oft-stat -i "{0}" -o /tmp/zonal_oft_out.txt -um "{1}"'.format(inputFile, maskFileTif)
  109. selectedCommands.update({1: oftCMD})
  110. if 2 in selectedTools:
  111. #~ maskFileShp = maskFile.replace('tif','shp')
  112. pkCMD = 'pkextract -f \'ESRI Shapefile\' -s "{0}" -i "{1}" -o /tmp/zonal_pkt_out.shp -polygon --rule mean'.format(maskFile, inputFile)
  113. selectedCommands.update({2: pkCMD})
  114. if 3 in selectedTools:
  115. s = open('bash_grass_set_cmd.sh', 'w')
  116. s.write('''resX=$(pkinfo -i "{0}" -ns | cut -d ' ' -f 2);resY=$(pkinfo -i "{0}" -nl | cut -d ' ' -f 2);xmin=$(pkinfo -i "{0}" -ulx | cut -d '=' -f 2);xmin_abs=${{xmin#-}};xmax=$(pkinfo -i "{0}" -lrx | cut -d '=' -f 2);ymax=$(pkinfo -i "{0}" -uly | cut -d '=' -f 2);ymin=$(pkinfo -i "{0}" -lry | cut -d '=' -f 2);ymin_abs=${{ymin#-}}
  117. INDIR="{1}"
  118. g.region n=$ymax e=$xmax s=$ymin w=$xmin rows=$resY cols=$resX
  119. r.in.gdal input={0} output=tmp_raster --overwrite
  120. r.in.gdal input={2} output=tmp_zones --overwrite
  121. r.univar -t map=tmp_raster zones=tmp_zones separator=comma output=/tmp/zonal_grass_out.csv --overwrite'''.format(inputFile, os.path.dirname(maskFile), maskTifName))
  122. t = open('bash_grass_exec_cmd.sh', 'w')
  123. t.write('''INDIR="{0}"
  124. epsg=$(pkinfo -i "{1}" -a_srs | tail -c -18 | cut -d '"' -f 4)
  125. grass72 -c epsg:$epsg "$INDIR"/grassdata/templocation -e
  126. export GRASS_BATCH_JOB="$INDIR/bash_grass_set_cmd.sh"
  127. export GRASS_VERBOSE=0
  128. grass72 "$INDIR"/grassdata/templocation/PERMANENT
  129. rm -rf "$INDIR"/grassdata/templocation'''.format(os.path.dirname(os.path.realpath(__file__)), inputFile))
  130. grassCMD = '/bin/bash -c "{0}/bash_grass_exec_cmd.sh > /dev/null 2>&1"'.format(os.path.dirname(os.path.realpath(__file__)))
  131. subprocess.call('chmod +x ./bash_grass_exec_cmd.sh', shell=True, stderr=subprocess.STDOUT)
  132. subprocess.call('chmod +x ./bash_grass_set_cmd.sh', shell=True, stderr=subprocess.STDOUT)
  133. selectedCommands.update({3: grassCMD})
  134. if 4 in selectedTools:
  135. maskFileTif = maskFile.replace('shp', 'tif')
  136. sagaCMD = 'saga_cmd statistics_grid 5 -ZONES "{0}" -STATLIST "{1}" -OUTTAB /tmp/zonal_saga_out.txt'.format(maskFileTif, inputFile)
  137. selectedCommands.update({4: sagaCMD})
  138. return selectedCommands
  139. def executeCommands(self, commands, repetitions):
  140. '''
  141. Executes the command for the selected provider.
  142. '''
  143. timing = {}
  144. secondsAvg = []
  145. cpuAvg = []
  146. # run the rasterization if neccessary
  147. rastRuns = 0
  148. for key in commands.items():
  149. if ((key[0] == 1 or key[0] == 3 or key[0] == 4) and rastRuns == 0):
  150. try:
  151. rasterize = subprocess.check_output('/bin/bash -c ./bash_rasterize.sh', shell=True, stderr=subprocess.STDOUT)
  152. rastRuns += 1
  153. except OSError:
  154. print "Bash file for rasterization not found."
  155. sys.exit()
  156. for key, value in commands.items():
  157. reps = 0
  158. for i in range(0, int(repetitions)):
  159. try:
  160. output = subprocess.check_output(
  161. '/usr/bin/time -f "seconds: %e \ncpu-load: %P" {0} | tail -2'.format(value),
  162. shell=True,
  163. stderr=subprocess.STDOUT
  164. )
  165. # play nice with programs which do not handle fast calls to well
  166. time.sleep(0.1)
  167. except subprocess.CalledProcessError as grepexc:
  168. print "error code", grepexc.returncode, grepexc.output
  169. # grep seconds and cpu-load from the string
  170. seconds = float((re.split(":|\n", output[0:28].replace(" ", "")))[1])
  171. cpuload = int(((re.split(":|\n", output[0:28].replace(" ", "")))[3]).replace("%", ""))
  172. # set output to 0 to prevent any 'misappropriation' of older outputs
  173. output = "sec: 0 \n cpu:0"
  174. # increment reps and fill lists for timing & cpu load
  175. secondsAvg.append(seconds)
  176. cpuAvg.append(cpuload)
  177. reps += 1
  178. # update the dict with the average timing and cpuload, as well as a repetition counter
  179. timing.update({key: [round(sum(secondsAvg) / len(secondsAvg), 3), sum(cpuAvg) / len(cpuAvg), reps]})
  180. # clear the lists for the next values
  181. secondsAvg[:] = []
  182. cpuAvg[:] = []
  183. # print timing
  184. # sys.exit()
  185. return timing
  186. def showOutput(self, output, selectedTools):
  187. '''
  188. Construct a pretty table to show a comparison of the programs
  189. on the command line.
  190. '''
  191. # merge the names of the selected tools with the numerical representation
  192. # save as a new dict
  193. dd = defaultdict(list)
  194. for d in (selectedTools, output):
  195. for key, value in d.iteritems():
  196. dd[key].extend([value])
  197. out = ''
  198. out += "All selected tools ran successfully.\n\n"
  199. # print with underscore
  200. out += "\033[4m{:<8} {:<17} {:<15} {:<15} {:<15}\033[0m\n".format('ID', 'Tool', 'seconds (avg)', 'CPU_load (avg)', 'No. of runs')
  201. for k, v in dd.iteritems():
  202. out += "{:<8} {:<17} {:<15} {:<15} {:<15}\n".format(k, dd[k][0], dd[k][1][0], dd[k][1][1], dd[k][1][2])
  203. print out
  204. # return out
  205. def showUsage(self):
  206. # print zonalBenchmark.__init__.__doc__
  207. print u'''Usage:
  208. python zonalStatBenchmark [tools] [input raster] [input mask / shape] [number of runs]
  209. [tools] expects a chain of numbers, seperated by a hyphen;
  210. e.g. 1-2-3 for selecting oft-stat, pktools and GRASS
  211. [input raster] the raster file for input
  212. [input mask / shape] the mask shape; expects a vector file which will be
  213. automatically rasterized if the selected toos require it
  214. [number of runs] the number of runs / repetitions per tool, e.g. "5" means
  215. every tool will run 5 times and the average of the runs will
  216. be calculated
  217. All arguments are mandatory.
  218. The following tools are available:
  219. 1. oft-stat
  220. 2. pktools
  221. 3. GRASS
  222. 4. SAGA
  223. '''
  224. # userArgs[0] = selected providers
  225. # userArgs[1] = input TIF
  226. # userArgs[2] = masking shape
  227. # userArgs[3] = number of repetitions
  228. args = []
  229. args.append(sys.argv)
  230. userArgs = zonalBenchmark().getArguments(args)
  231. commands = zonalBenchmark().setCommands(userArgs[0], userArgs[1], userArgs[2])
  232. timeOutput = zonalBenchmark().executeCommands(commands, userArgs[3])
  233. zonalBenchmark().showOutput(timeOutput, userArgs[0])