##// END OF EJS Templates
Refactor to prefer rpy2's pandas2ri conversion system.
Thomas Kluyver -
Show More
@@ -1,638 +1,660 b''
1 1 # -*- coding: utf-8 -*-
2 2 """
3 3 ======
4 4 Rmagic
5 5 ======
6 6
7 7 Magic command interface for interactive work with R via rpy2
8 8
9 9 Usage
10 10 =====
11 11
12 12 ``%R``
13 13
14 14 {R_DOC}
15 15
16 16 ``%Rpush``
17 17
18 18 {RPUSH_DOC}
19 19
20 20 ``%Rpull``
21 21
22 22 {RPULL_DOC}
23 23
24 24 ``%Rget``
25 25
26 26 {RGET_DOC}
27 27
28 28 """
29 29
30 30 #-----------------------------------------------------------------------------
31 31 # Copyright (C) 2012 The IPython Development Team
32 32 #
33 33 # Distributed under the terms of the BSD License. The full license is in
34 34 # the file COPYING, distributed as part of this software.
35 35 #-----------------------------------------------------------------------------
36 36
37 37 import sys
38 38 import tempfile
39 39 from glob import glob
40 40 from shutil import rmtree
41 41 from getopt import getopt
42 42
43 43 # numpy and rpy2 imports
44 44
45 45 import numpy as np
46 46
47 47 import rpy2.rinterface as ri
48 48 import rpy2.robjects as ro
49 from rpy2.robjects.numpy2ri import numpy2ri
50 ro.conversion.py2ri = numpy2ri
49 try:
50 from rpy2.robjects import pandas2ri
51 pandas2ri.activate()
52 except ImportError:
53 pandas2ri = None
54 from rpy2.robjects import numpy2ri
55 numpy2ri.activate()
51 56
52 57 # IPython imports
53 58
54 59 from IPython.core.displaypub import publish_display_data
55 60 from IPython.core.magic import (Magics, magics_class, cell_magic, line_magic,
56 61 line_cell_magic, needs_local_scope)
57 62 from IPython.testing.skipdoctest import skip_doctest
58 63 from IPython.core.magic_arguments import (
59 64 argument, magic_arguments, parse_argstring
60 65 )
66 from IPython.external.simplegeneric import generic
61 67 from IPython.utils.py3compat import str_to_unicode, unicode_to_str, PY3
62 68
class RInterpreterError(ri.RRuntimeError):
    """Raised when R code run through the R magics fails to parse or evaluate.

    Attributes
    ----------
    line : the R source that was being evaluated
    err : R's error message, with trailing whitespace stripped
    stdout : whatever R had written to its console, trailing whitespace stripped
    """
    def __init__(self, line, err, stdout):
        self.line = line
        self.err = err.rstrip()
        self.stdout = stdout.rstrip()

    def __unicode__(self):
        # Always report the failing line and R's message; only append the
        # captured console output when it adds something beyond the message.
        parts = ['Failed to parse and evaluate line %r.\nR error message: %r' %
                 (self.line, self.err)]
        if self.stdout and (self.stdout != self.err):
            parts.append('\nR stdout:\n' + self.stdout)
        return ''.join(parts)

    if PY3:
        # On Python 3 the text form *is* the str form.
        __str__ = __unicode__
    else:
        def __str__(self):
            # Python 2: str() must return bytes, so encode the unicode message.
            return unicode_to_str(unicode(self), 'utf-8')
82 88
def Rconverter(Robj, dataframe=False):
    """
    Convert an object in R's namespace to one suitable
    for ipython's namespace.

    For a data.frame, it tries to return a structured array.
    It first checks for colnames, then names.
    If all are NULL, it returns np.asarray(Robj), else
    it tries to construct a recarray

    Parameters
    ----------

    Robj: an R object returned from rpy2

    dataframe: bool
        If True, try to build a structured array whose field names come
        from the object's colnames (preferred) or names. If False, the
        object is passed straight to np.asarray.

    Returns
    -------

    The result of np.asarray on either Robj itself or on the record
    array built from it.
    """
    if not dataframe:
        # Nothing R-side is needed for the plain conversion, so don't pay
        # for any R lookups here.
        return np.asarray(Robj)

    # with pandas, R's rownames could additionally be used for the index
    cols = ro.r('colnames')(Robj)
    _names = ro.r('names')(Robj)
    if cols != ri.NULL:
        Robj = ro.r('as.data.frame')(Robj)
        field_names = tuple(np.array(cols))
    elif _names != ri.NULL:
        field_names = tuple(np.array(_names))
    else:  # failed to find names
        return np.asarray(Robj)
    return np.asarray(np.rec.fromarrays(Robj, names=field_names))
116 122
@generic
def pyconverter(pyobj):
    """Convert Python objects to R objects.

    This is the fallback of a generic function: objects of a type with no
    registered handler are returned unchanged. Add handlers for further
    types using the decorator::

        @pyconverter.when_type(SomeType)
        def convert_sometype(pyobj):
            ...
    """
    return pyobj
130
# The default conversion for lists seems to make them a nested list. That has
# some advantages, but is rarely convenient, so for interactive use, we convert
# lists to a numpy array, which becomes an R vector.
@pyconverter.when_type(list)
def pyconverter_list(pyobj):
    """Turn a Python list into a numpy array before it is handed to R."""
    as_array = np.asarray(pyobj)
    return as_array
137
if pandas2ri is None:
    # pandas2ri was new in rpy2 2.3.3, so for now we'll fallback to pandas'
    # conversion function.
    try:
        from pandas import DataFrame
        from pandas.rpy.common import convert_to_r_dataframe
    except ImportError:
        # pandas (or its rpy bridge) is not installed: DataFrames simply get
        # no dedicated handler, which is a deliberate best-effort fallback.
        pass
    else:
        @pyconverter.when_type(DataFrame)
        def pyconverter_dataframe(pyobj):
            """Convert a pandas DataFrame to an R data.frame."""
            return convert_to_r_dataframe(pyobj, strings_as_factors=True)
127 149
@magics_class
class RMagics(Magics):
    """A set of magics useful for interactive work with R via rpy2.
    """

    def __init__(self, shell, Rconverter=Rconverter,
                 pyconverter=pyconverter,
                 cache_display_data=False):
        """
        Parameters
        ----------

        shell : IPython shell

        Rconverter : callable
            To be called on values taken from R before putting them in the
            IPython namespace.

        pyconverter : callable
            To be called on values in ipython namespace before
            assigning to variables in rpy2.

        cache_display_data : bool
            If True, the published results of the final call to R are
            cached in the variable 'display_cache'.

        """
        super(RMagics, self).__init__(shell)
        self.cache_display_data = cache_display_data

        self.r = ro.R()

        self.Rstdout_cache = []
        self.pyconverter = pyconverter
        self.Rconverter = Rconverter

    def eval(self, line):
        '''
        Parse and evaluate a line with rpy2.
        Returns the output to R's stdout() connection
        and the value of eval(parse(line)).

        Raises RInterpreterError if R fails to parse or evaluate the line.
        '''
        old_writeconsole = ri.get_writeconsole()
        ri.set_writeconsole(self.write_console)
        try:
            value = ri.baseenv['eval'](ri.parse(line))
        except (ri.RRuntimeError, ValueError) as exception:
            warning_or_other_msg = self.flush() # otherwise next return seems to have copy of error
            raise RInterpreterError(line, str_to_unicode(str(exception)), warning_or_other_msg)
        else:
            text_output = self.flush()
        finally:
            # Always hand the console back, even when evaluation failed;
            # otherwise R's output stays redirected into our cache.
            ri.set_writeconsole(old_writeconsole)
        return text_output, value

    def write_console(self, output):
        '''
        A hook to capture R's stdout in a cache.
        '''
        self.Rstdout_cache.append(output)

    def flush(self):
        '''
        Flush R's stdout cache to a string, returning the string.
        '''
        value = ''.join([str_to_unicode(s, 'utf-8') for s in self.Rstdout_cache])
        self.Rstdout_cache = []
        return value

    def _lookup_input(self, name, local_ns):
        '''
        Return the Python value bound to `name`, looking first in `local_ns`
        and then in the shell's user namespace.
        '''
        try:
            return local_ns[name]
        except KeyError:
            try:
                return self.shell.user_ns[name]
            except KeyError:
                # reraise the KeyError as a NameError so that it looks like
                # the standard python behavior when you use an undefined
                # variable
                raise NameError("name '%s' is not defined" % name)

    @skip_doctest
    @needs_local_scope
    @line_magic
    def Rpush(self, line, local_ns=None):
        '''
        A line-level magic for R that pushes
        variables from python to rpy2. The line should be made up
        of whitespace separated variable names in the IPython
        namespace::

            In [7]: import numpy as np

            In [8]: X = np.array([4.5,6.3,7.9])

            In [9]: X.mean()
            Out[9]: 6.2333333333333343

            In [10]: %Rpush X

            In [11]: %R mean(X)
            Out[11]: array([ 6.23333333])

        '''
        if local_ns is None:
            local_ns = {}

        # split() with no argument tolerates repeated, leading and trailing
        # whitespace, which split(' ') would turn into empty names.
        for name in line.split():
            self.r.assign(name, self.pyconverter(self._lookup_input(name, local_ns)))

    @skip_doctest
    @magic_arguments()
    @argument(
        '-d', '--as_dataframe', action='store_true',
        default=False,
        help='Convert objects to data.frames before returning to ipython.'
        )
    @argument(
        'outputs',
        nargs='*',
        )
    @line_magic
    def Rpull(self, line):
        '''
        A line-level magic for R that pulls
        variables from rpy2 to python::

            In [18]: _ = %R x = c(3,4,6.7); y = c(4,6,7); z = c('a',3,4)

            In [19]: %Rpull x y z

            In [20]: x
            Out[20]: array([ 3. ,  4. ,  6.7])

            In [21]: y
            Out[21]: array([ 4.,  6.,  7.])

            In [22]: z
            Out[22]:
            array(['a', '3', '4'],
                  dtype='|S1')


        If --as_dataframe, then each object is returned as a structured array
        after first being passed through "as.data.frame" in R before
        calling self.Rconverter.
        This is useful when a structured array is desired as output, or
        when the object in R has mixed data types.
        See the %%R docstring for more examples.

        Notes
        -----

        Beware that R names can have '.' so this is not fool proof.
        To avoid this, don't name your R objects with '.'s...

        '''
        args = parse_argstring(self.Rpull, line)
        for output in args.outputs:
            converted = self.Rconverter(self.r(output), dataframe=args.as_dataframe)
            self.shell.push({output: converted})

    @skip_doctest
    @magic_arguments()
    @argument(
        '-d', '--as_dataframe', action='store_true',
        default=False,
        help='Convert objects to data.frames before returning to ipython.'
        )
    @argument(
        'output',
        nargs=1,
        type=str,
        )
    @line_magic
    def Rget(self, line):
        '''
        Return an object from rpy2, possibly as a structured array (if possible).
        Similar to Rpull except only one argument is accepted and the value is
        returned rather than pushed to self.shell.user_ns::

            In [3]: dtype=[('x', '<i4'), ('y', '<f8'), ('z', '|S1')]

            In [4]: datapy = np.array([(1, 2.9, 'a'), (2, 3.5, 'b'), (3, 2.1, 'c'), (4, 5, 'e')], dtype=dtype)

            In [5]: %R -i datapy

            In [6]: %Rget datapy
            Out[6]:
            array([['1', '2', '3', '4'],
                   ['2', '3', '2', '5'],
                   ['a', 'b', 'c', 'e']],
                  dtype='|S1')

            In [7]: %Rget -d datapy
            Out[7]:
            array([(1, 2.9, 'a'), (2, 3.5, 'b'), (3, 2.1, 'c'), (4, 5.0, 'e')],
                  dtype=[('x', '<i4'), ('y', '<f8'), ('z', '|S1')])

        '''
        args = parse_argstring(self.Rget, line)
        output = args.output
        return self.Rconverter(self.r(output[0]), dataframe=args.as_dataframe)

    @skip_doctest
    @magic_arguments()
    @argument(
        '-i', '--input', action='append',
        help='Names of input variable from shell.user_ns to be assigned to R variables of the same names after calling self.pyconverter. Multiple names can be passed separated only by commas with no whitespace.'
        )
    @argument(
        '-o', '--output', action='append',
        help='Names of variables to be pushed from rpy2 to shell.user_ns after executing cell body and applying self.Rconverter. Multiple names can be passed separated only by commas with no whitespace.'
        )
    @argument(
        '-w', '--width', type=int,
        help='Width of png plotting device sent as an argument to *png* in R.'
        )
    @argument(
        '-h', '--height', type=int,
        help='Height of png plotting device sent as an argument to *png* in R.'
        )
    @argument(
        '-d', '--dataframe', action='append',
        help='Convert these objects to data.frames and return as structured arrays.'
        )
    @argument(
        # str_to_unicode (already imported from py3compat) is used instead of
        # the bare `unicode` builtin, which does not exist on Python 3 and
        # would make this class fail at definition time there.
        '-u', '--units', type=str_to_unicode, choices=["px", "in", "cm", "mm"],
        help='Units of png plotting device sent as an argument to *png* in R. One of ["px", "in", "cm", "mm"].'
        )
    @argument(
        '-r', '--res', type=int,
        help='Resolution of png plotting device sent as an argument to *png* in R. Defaults to 72 if *units* is one of ["in", "cm", "mm"].'
        )
    @argument(
        '-p', '--pointsize', type=int,
        help='Pointsize of png plotting device sent as an argument to *png* in R.'
        )
    @argument(
        '-b', '--bg',
        help='Background of png plotting device sent as an argument to *png* in R.'
        )
    @argument(
        '-n', '--noreturn',
        help='Force the magic to not return anything.',
        action='store_true',
        default=False
        )
    @argument(
        'code',
        nargs='*',
        )
    @needs_local_scope
    @line_cell_magic
    def R(self, line, cell=None, local_ns=None):
        '''
        Execute code in R, and pull some of the results back into the Python namespace.

        In line mode, this will evaluate an expression and convert the returned value to a Python object.
        The return value is determined by rpy2's behaviour of returning the result of evaluating the
        final line.

        Multiple R lines can be executed by joining them with semicolons::

            In [9]: %R X=c(1,4,5,7); sd(X); mean(X)
            Out[9]: array([ 4.25])

        As a cell, this will run a block of R code, without bringing anything back by default::

            In [10]: %%R
               ....: Y = c(2,4,3,9)
               ....: print(summary(lm(Y~X)))
               ....:

            Call:
            lm(formula = Y ~ X)

            Residuals:
                1     2     3     4
             0.88 -0.24 -2.28  1.64

            Coefficients:
                        Estimate Std. Error t value Pr(>|t|)
            (Intercept)   0.0800     2.3000   0.035    0.975
            X             1.0400     0.4822   2.157    0.164

            Residual standard error: 2.088 on 2 degrees of freedom
            Multiple R-squared: 0.6993,Adjusted R-squared: 0.549
            F-statistic: 4.651 on 1 and 2 DF,  p-value: 0.1638

        In the notebook, plots are published as the output of the cell.

        %R plot(X, Y)

        will create a scatter plot of X vs Y.

        If cell is not None and line has some R code, it is prepended to
        the R code in cell.

        Objects can be passed back and forth between rpy2 and python via the -i -o flags in line::

            In [14]: Z = np.array([1,4,5,10])

            In [15]: %R -i Z mean(Z)
            Out[15]: array([ 5.])


            In [16]: %R -o W W=Z*mean(Z)
            Out[16]: array([  5.,  20.,  25.,  50.])

            In [17]: W
            Out[17]: array([  5.,  20.,  25.,  50.])

        The return value is determined by these rules:

        * If the cell is not None, the magic returns None.

        * If the cell evaluates as False, the resulting value is returned
          unless the final line prints something to the console, in
          which case None is returned.

        * If the final line results in a NULL value when evaluated
          by rpy2, then None is returned.

        * No attempt is made to convert the final value to a structured array.
          Use the --dataframe flag or %Rget to push / return a structured array.

        * If the -n flag is present, there is no return value.

        * A trailing ';' will also result in no return value as the last
          value in the line is an empty string.

        The --dataframe argument will attempt to return structured arrays.
        This is useful for dataframes with
        mixed data types. Note also that for a data.frame,
        if it is returned as an ndarray, it is transposed::

            In [18]: dtype=[('x', '<i4'), ('y', '<f8'), ('z', '|S1')]

            In [19]: datapy = np.array([(1, 2.9, 'a'), (2, 3.5, 'b'), (3, 2.1, 'c'), (4, 5, 'e')], dtype=dtype)

            In [20]: %%R -o datar
            datar = datapy
               ....:

            In [21]: datar
            Out[21]:
            array([['1', '2', '3', '4'],
                   ['2', '3', '2', '5'],
                   ['a', 'b', 'c', 'e']],
                  dtype='|S1')

            In [22]: %%R -d datar
            datar = datapy
               ....:

            In [23]: datar
            Out[23]:
            array([(1, 2.9, 'a'), (2, 3.5, 'b'), (3, 2.1, 'c'), (4, 5.0, 'e')],
                  dtype=[('x', '<i4'), ('y', '<f8'), ('z', '|S1')])

        The --dataframe argument first tries colnames, then names.
        If both are NULL, it returns an ndarray (i.e. unstructured)::

            In [1]: %R mydata=c(4,6,8.3); NULL

            In [2]: %R -d mydata

            In [3]: mydata
            Out[3]: array([ 4. ,  6. ,  8.3])

            In [4]: %R names(mydata) = c('a','b','c'); NULL

            In [5]: %R -d mydata

            In [6]: mydata
            Out[6]:
            array((4.0, 6.0, 8.3),
                  dtype=[('a', '<f8'), ('b', '<f8'), ('c', '<f8')])

            In [7]: %R -o mydata

            In [8]: mydata
            Out[8]: array([ 4. ,  6. ,  8.3])

        '''

        args = parse_argstring(self.R, line)

        # arguments 'code' in line are prepended to
        # the cell lines

        if cell is None:
            code = ''
            return_output = True
            line_mode = True
        else:
            code = cell
            return_output = False
            line_mode = False

        code = ' '.join(args.code) + code

        # if there is no local namespace then default to an empty dict
        if local_ns is None:
            local_ns = {}

        if args.input:
            for name in ','.join(args.input).split(','):
                self.r.assign(name, self.pyconverter(self._lookup_input(name, local_ns)))

        if getattr(args, 'units') is not None:
            if args.units != "px" and getattr(args, 'res') is None:
                args.res = 72
            # R needs the unit as a quoted string literal
            args.units = '"%s"' % args.units

        png_argdict = dict([(n, getattr(args, n)) for n in ['units', 'res', 'height', 'width', 'bg', 'pointsize']])
        png_args = ','.join(['%s=%s' % (o,v) for o, v in png_argdict.items() if v is not None])

        # execute the R code in a temporary directory; the directory is
        # removed even if the R code fails
        tmpd = tempfile.mkdtemp()
        try:
            self.r('png("%s/Rplots%%03d.png",%s)' % (tmpd.replace('\\', '/'), png_args))

            text_output = ''
            try:
                if line_mode:
                    for statement in code.split(';'):
                        text_result, result = self.eval(statement)
                        text_output += text_result
                    if text_result:
                        # the last line printed something to the console so we won't return it
                        return_output = False
                else:
                    text_result, result = self.eval(code)
                    text_output += text_result
            finally:
                # close the png device whether or not the R code succeeded,
                # so a failing cell does not leave a device open
                self.r('dev.off()')

            # read out all the saved .png files, in plot order
            # (the file names carry a zero-padded counter)
            images = []
            for imgfile in sorted(glob("%s/Rplots*png" % tmpd)):
                with open(imgfile, 'rb') as img:
                    images.append(img.read())
        finally:
            # kill the temporary directory
            rmtree(tmpd)

        # now publish the images
        # mimicking IPython/zmq/pylab/backend_inline.py
        fmt = 'png'
        mimetypes = { 'png' : 'image/png', 'svg' : 'image/svg+xml' }
        mime = mimetypes[fmt]

        # publish the printed R objects, if any

        display_data = []
        if text_output:
            display_data.append(('RMagic.R', {'text/plain':text_output}))

        for image in images:
            # flush text streams before sending figures, helps a little with
            # output synchronization in the console (though it's a bandaid,
            # not a real sln)
            sys.stdout.flush(); sys.stderr.flush()
            display_data.append(('RMagic.R', {mime: image}))

        # try to turn every output into a numpy array
        # this means that output are assumed to be castable
        # as numpy arrays

        if args.output:
            for output in ','.join(args.output).split(','):
                self.shell.push({output:self.Rconverter(self.r(output), dataframe=False)})

        if args.dataframe:
            for output in ','.join(args.dataframe).split(','):
                self.shell.push({output:self.Rconverter(self.r(output), dataframe=True)})

        for tag, disp_d in display_data:
            publish_display_data(tag, disp_d)

        # this will keep a reference to the display_data
        # which might be useful to other objects who happen to use
        # this method

        if self.cache_display_data:
            self.display_cache = display_data

        # if in line mode and return_output, return the result as an ndarray
        if return_output and not args.noreturn:
            if result != ri.NULL:
                return self.Rconverter(result, dataframe=False)
623 645
# Fill the {R_DOC}, {RPUSH_DOC}, {RPULL_DOC} and {RGET_DOC} placeholders of the
# module docstring with the docstrings of the corresponding magics. Each is
# prefixed with eight spaces so its first line lines up with the rest.
__doc__ = __doc__.format(
    R_DOC = ' '*8 + RMagics.R.__doc__,
    RPUSH_DOC = ' '*8 + RMagics.Rpush.__doc__,
    RPULL_DOC = ' '*8 + RMagics.Rpull.__doc__,
    RGET_DOC = ' '*8 + RMagics.Rget.__doc__
)
630 652
631 653
def load_ipython_extension(ip):
    """Load the extension in IPython.

    Registers the RMagics class with the given IPython instance and, when
    readline is in use, restores the completer delimiters.
    """
    ip.register_magics(RMagics)
    if not ip.has_readline:
        return
    # Initialising rpy2 interferes with readline. Since, at this point, we've
    # probably just loaded rpy2, we reset the delimiters. See issue gh-2759.
    delims = ip.readline_delims
    ip.readline.set_completer_delims(delims)
General Comments 0
You need to be logged in to leave comments. Login now