demo/modules/statstools.py
author Uche Ogbuji <uche@ogbuji.net>
Mon Sep 21 22:08:46 2009 -0600 (4 months ago)
changeset 129 e8ca24394b6d
parent 58 c004cb284645
permissions -rw-r--r--
More ditching of xml_print
     1 # -*- encoding: utf-8 -*-
'''
Akara service module exposing SPSS data as JSON.  The conversion itself is
delegated to GNU R, which is run as a subprocess.

See also:
 * http://wiki.math.yorku.ca/index.php/R:_Data_conversion_from_SPSS
   (R: Data conversion from SPSS)
'''
     5 
     6 from __future__ import with_statement
     7 import sys, time
     8 import urllib, urlparse
     9 import tempfile
    10 import os
    11 import re
    12 import csv
    13 import cgi
    14 from cStringIO import StringIO
    15 from gettext import gettext as _
    16 from itertools import *
    17 from functools import *
    18 from subprocess import *
    19 
    20 import amara
    21 from amara.xslt import transform
    22 from amara.xpath.util import simplify
    23 from amara.bindery import html
    24 from amara.lib.util import *
    25 
    26 from akara.services import simple_service, response
    27 
    28 VAR_PAT = re.compile('VARIABLE\s+LABELS\s+(((\w+)\s+"([^"]+)"\s*)+)\.')
    29 VAR_DEF_PAT = re.compile('(\w+)\s+"([^"]+)"')
    30 
    31 VALUE_PAT = re.compile('VALUE\s+LABELS\s+((/(\w+)\s+(\'(\w+)\'\s+"([^"]+)"\s*)+)+)\.')
    32 VALUE_DEF_SET_PAT = re.compile('/(\w+)\s+((\'(\w+)\'\s+"([^"]+)"\s*)+)')
    33 VALUE_DEF_PAT = re.compile('\'(\w+)\'\s+"([^"]+)"')
    34 
# Value stored under the 'type' key of every item emitted by spss2json
VALUE_SET_TYPE = 'value_set'
# Top-level keys of the spss2json output that hold the variable-label and
# value-label mappings returned by parse_spss
VARIABLE_LABELS_TYPE = 'variable_labels'
VALUE_LABELS_TYPE = 'value_labels'

# Alternative R script based on the 'foreign' package (disabled)
#R_SCRIPT = '''library(foreign)
#mydata <- read.spss(file='%s')
#write.csv2(mydata)
#'''

# R program sent to the R command on its standard input.  The %s placeholder
# is filled with the path of the temporary .por file; parse_spss reads what R
# prints on stdout as semicolon-delimited CSV.
R_SCRIPT = '''library(Hmisc)
mydata <- spss.get(file='%s')
write.csv2(mydata)
'''

# Command line used to launch R.  When the name AKARA_MODULE_CONFIG exists it
# is consulted for an 'r_command' entry; otherwise (NameError) fall back to 'r'.
try:
    R_FILE_CMD = AKARA_MODULE_CONFIG.get('r_command', 'r')
except NameError:
    #Not running from Akara
    R_FILE_CMD = 'r'

# Message used by spss2json when the request carries no 'POR' field
POR_REQUIRED = _("The 'POR' POST parameter is mandatory.")

# Service identifier handed to the simple_service decorator of spss2json
SERVICE_ID = 'http://purl.org/akara/services/builtin/spss.json'
    58 @simple_service('POST', SERVICE_ID, 'spss.json', 'application/json')
    59 def spss2json(body, ctype, **params):
    60     '''
    61     Uses GNU R to convert SPSS to JSON
    62     Optionally tries to guess long labels from an original .SPS file
    63     
    64     Requires POST body of multipart/form-data
    65     
    66     Sample request:
    67     curl -F "POR=@foo.por" http://localhost:8880/spss.json
    68     curl -F "POR=@foo.por" -F "SPSS=@foo.sps" http://localhost:8880/spss.json
    69     '''
    70     #curl --request POST -F "POR=@lat506.por" -F "SPSS=@LAT506.SPS" http://labs.zepheira.com:8880/spss.json
    71     
    72     #Useful:
    73     # * [[http://wiki.math.yorku.ca/index.php/R:_Data_conversion_from_SPSS|R: Data conversion from SPSS]]
    74 
    75     import simplejson
    76 
    77     body = StringIO(body)
    78     form = cgi.FieldStorage(fp=body, environ=WSGI_ENVIRON)
    79     #for k in form:
    80     #    print >> sys.stderr, (k, form[k][:100])
    81     por = form.getvalue('POR')
    82     assert_not_equal(por, None, msg=POR_REQUIRED)
    83     spss = form.getvalue('SPSS')
    84     
    85     (items, varlabels, valuelabels) = parse_spss(por, spss)
    86 
    87     for count, item in enumerate(items):
    88         #print >> sys.stderr, row
    89         item['id'] = item['label'] = '_' + str(count)
    90         item['type'] = VALUE_SET_TYPE
    91 
    92     return simplejson.dumps({'items': items, VARIABLE_LABELS_TYPE: varlabels, VALUE_LABELS_TYPE: valuelabels}, indent=4)
    93 
    94 
    95 def parse_spss(spss_por, spss_syntax=None):
    96     '''
    97     Uses GNU R to convert SPSS to a simple Python data structure
    98     Optionally tries to guess long labels from an original .SPS file
    99     '''
   100     varlabels = {}
   101     valuelabels = {}
   102     if spss_syntax:
   103         matchinfo = VAR_PAT.search(spss_syntax)
   104         if matchinfo:
   105             #print >> sys.stderr, matchinfo.groups
   106             defns = matchinfo.group(1)
   107             for defn in VAR_DEF_PAT.finditer(defns):
   108                 varlabels[defn.group(1)] = defn.group(2)
   109 
   110         matchinfo = VALUE_PAT.search(spss_syntax)
   111         defsets = matchinfo.group(1)
   112         for defset in VALUE_DEF_SET_PAT.finditer(defsets):
   113             valuelabelset = {}
   114             for defn in VALUE_DEF_PAT.finditer(defset.group(2)):
   115                 valuelabelset[defn.group(1)] = defn.group(2)
   116             valuelabels[defset.group(1)] = valuelabelset
   117 
   118     #print >> sys.stderr, varlabels
   119     #print >> sys.stderr, valuelabels
   120 
   121     #print >> sys.stderr, por[:100]
   122     #print >> sys.stderr, spss[:100]
   123     temp = tempfile.mkstemp(suffix=".por")
   124     os.write(temp[0], spss_por)
   125 
   126     cmdline = R_FILE_CMD
   127     process = Popen(cmdline, stdin=PIPE, stdout=PIPE, stderr=PIPE, universal_newlines=True, shell=True)
   128     
   129     csvdata, perr = process.communicate(input=R_SCRIPT%temp[1])
   130     os.close(temp[0])
   131     os.remove(temp[1])
   132     if not csvdata:
   133         print >> sys.stderr, R_SCRIPT%temp[1]
   134         print >> sys.stderr, perr
   135         #FIXME: L10N
   136         raise ValueError('Empty output from the command line.  Probably a failure.  Command line: "%s"'%cmdline)
   137 
   138     def value(k, v):
   139         if k in valuelabels and v in valuelabels[k]:
   140             return valuelabels[k][v]
   141         else:
   142             return v
   143 
   144     r_reader = csv.DictReader(csvdata.splitlines(), delimiter=';')
   145     rows = [
   146         dict(((k, value(k, v.strip())) for (k, v) in row.iteritems()))
   147         for row in r_reader
   148     ]
   149 
   150     return (rows, varlabels, valuelabels)
   151