453 lines
14 KiB
Python
453 lines
14 KiB
Python
|
import decimal
|
||
|
import json as _json
|
||
|
import sys
|
||
|
import re
|
||
|
from functools import reduce
|
||
|
|
||
|
from _plotly_utils.optional_imports import get_module
|
||
|
from _plotly_utils.basevalidators import ImageUriValidator
|
||
|
|
||
|
|
||
|
def cumsum(x):
    """
    Custom cumsum to avoid a numpy import.

    Returns a new list whose k-th element is the sum of the first k + 1
    elements of `x`. The first element is taken as-is (no implicit zero is
    added), so any values supporting `+` with each other can be accumulated.

    :param (iterable) x: The values to accumulate.
    :return (list): The running totals; an empty list for empty input.
    """
    totals = []
    for value in x:
        # Append in place instead of rebuilding the list on every step, which
        # keeps the whole accumulation linear in the length of the input.
        totals.append(totals[-1] + value if totals else value)
    return totals
|
||
|
|
||
|
|
||
|
class PlotlyJSONEncoder(_json.JSONEncoder):
    """
    Meant to be passed as the `cls` kwarg to json.dumps(obj, cls=..)

    See PlotlyJSONEncoder.default for more implementation information.

    Additionally, this encoder overrides nan functionality so that 'Inf',
    'NaN' and '-Inf' encode to 'null'. Which is stricter JSON than the Python
    version.

    """

    def coerce_to_strict(self, const):
        """
        This is used to ultimately *encode* into strict JSON, see `encode`

        It is passed to `json.loads` as `parse_constant`: the three extended
        constants map to None, anything else is returned unchanged.

        """
        # before python 2.7, 'true', 'false', 'null', were include here.
        if const in ("Infinity", "-Infinity", "NaN"):
            return None
        else:
            return const

    def encode(self, o):
        """
        Load and then dump the result using parse_constant kwarg

        Note that setting invalid separators will cause a failure at this step.

        :param o: The object to encode.
        :return (str): Strict JSON in which NaN / Infinity / -Infinity values
            are emitted as `null`.
        :raises ValueError: If the intermediate encoding cannot be parsed back
            (e.g. because invalid separators were configured).
        """
        # this will raise errors in a normal-expected way
        encoded_o = super(PlotlyJSONEncoder, self).encode(o)
        # Brute force guessing whether NaN or Infinity values are in the string
        # We catch false positive cases (e.g. strings such as titles, labels etc.)
        # but this is ok since the intention is to skip the decoding / reencoding
        # step when it's completely safe

        if not ("NaN" in encoded_o or "Infinity" in encoded_o):
            return encoded_o

        # now:
        #    1. `loads` to switch Infinity, -Infinity, NaN to None
        #    2. `dumps` again so you get 'null' instead of extended JSON
        try:
            new_o = _json.loads(encoded_o, parse_constant=self.coerce_to_strict)
        except ValueError:
            # invalid separators will fail here. raise a helpful exception
            raise ValueError(
                "Encoding into strict JSON failed. Did you set the separators "
                "valid JSON separators?"
            )
        else:
            # Forward the formatting options this encoder was configured with.
            # `ensure_ascii` must be forwarded too: otherwise the second pass
            # falls back to the json default and escapes non-ASCII text even
            # when the encoder was created with ensure_ascii=False.
            return _json.dumps(
                new_o,
                sort_keys=self.sort_keys,
                indent=self.indent,
                separators=(self.item_separator, self.key_separator),
                ensure_ascii=self.ensure_ascii,
            )

    def default(self, obj):
        """
        Accept an object (of unknown type) and try to encode with priority:
        1. plotly: objects exposing a `to_plotly_json` method
        2. sage: sage math cloud
        3. numpy: masked constant / datetime ndarrays
        4. pandas: NaT / NA
        5. datetime: objects exposing `isoformat`
        6. date: `isoformat` output converted to a plotly time string
        7. list: objects exposing `tolist`
        8. decimal: decimal.Decimal
        9. pil: PIL images

        Each method throws a NotEncodable exception if it fails.

        The default method will only get hit if the object is not a type that
        is naturally encoded by json:

            Normal objects:
                dict                object
                list, tuple         array
                str, unicode        string
                int, long, float    number
                True                true
                False               false
                None                null

            Extended objects:
                float('nan')        'NaN'
                float('infinity')   'Infinity'
                float('-infinity')  '-Infinity'

        Therefore, we only anticipate either unknown iterables or values here.

        """
        # TODO: The ordering of these methods is *very* important. Is this OK?
        encoding_methods = (
            self.encode_as_plotly,
            self.encode_as_sage,
            self.encode_as_numpy,
            self.encode_as_pandas,
            self.encode_as_datetime,
            self.encode_as_date,
            self.encode_as_list,  # because some values have `tolist` do last.
            self.encode_as_decimal,
            self.encode_as_pil,
        )
        for encoding_method in encoding_methods:
            try:
                return encoding_method(obj)
            except NotEncodable:
                pass
        # Nothing matched: defer to the base class, which raises TypeError.
        return _json.JSONEncoder.default(self, obj)

    @staticmethod
    def encode_as_plotly(obj):
        """Attempt to use a builtin `to_plotly_json` method."""
        try:
            return obj.to_plotly_json()
        except AttributeError:
            raise NotEncodable

    @staticmethod
    def encode_as_list(obj):
        """Attempt to use `tolist` method to convert to normal Python list."""
        if hasattr(obj, "tolist"):
            return obj.tolist()
        else:
            raise NotEncodable

    @staticmethod
    def encode_as_sage(obj):
        """Attempt to convert sage.all.RR to floats and sage.all.ZZ to ints"""
        sage_all = get_module("sage.all")
        if not sage_all:
            raise NotEncodable

        if obj in sage_all.RR:
            return float(obj)
        elif obj in sage_all.ZZ:
            return int(obj)
        else:
            raise NotEncodable

    @staticmethod
    def encode_as_pandas(obj):
        """Attempt to convert pandas.NaT / pandas.NA"""
        pandas = get_module("pandas", should_load=False)
        if not pandas:
            raise NotEncodable

        if obj is pandas.NaT:
            return None

        # pandas.NA was introduced in pandas 1.0
        if hasattr(pandas, "NA") and obj is pandas.NA:
            return None

        raise NotEncodable

    @staticmethod
    def encode_as_numpy(obj):
        """Attempt to convert numpy.ma.core.masked and datetime ndarrays"""
        numpy = get_module("numpy", should_load=False)
        if not numpy:
            raise NotEncodable

        if obj is numpy.ma.core.masked:
            # Encoded as NaN here; `encode` later turns it into null.
            return float("nan")
        elif isinstance(obj, numpy.ndarray) and obj.dtype.kind == "M":
            try:
                return numpy.datetime_as_string(obj).tolist()
            except TypeError:
                # Fall through so the next encoding method gets a chance.
                pass

        raise NotEncodable

    @staticmethod
    def encode_as_datetime(obj):
        """Convert datetime objects to iso-format strings"""
        try:
            return obj.isoformat()
        except AttributeError:
            raise NotEncodable

    @staticmethod
    def encode_as_date(obj):
        """Attempt to convert to utc-iso time string using date methods."""
        # NOTE(review): `default` tries encode_as_datetime first, and that
        # method succeeds for every object with `isoformat`, so this method
        # only produces a value when it is called directly.
        try:
            time_string = obj.isoformat()
        except AttributeError:
            raise NotEncodable
        else:
            return iso_to_plotly_time_string(time_string)

    @staticmethod
    def encode_as_decimal(obj):
        """Attempt to encode decimal by converting it to float"""
        if isinstance(obj, decimal.Decimal):
            return float(obj)
        else:
            raise NotEncodable

    @staticmethod
    def encode_as_pil(obj):
        """Attempt to convert PIL.Image.Image to base64 data uri"""
        image = get_module("PIL.Image")
        if image is not None and isinstance(obj, image.Image):
            return ImageUriValidator.pil_image_to_uri(obj)
        else:
            raise NotEncodable
|
||
|
|
||
|
|
||
|
class NotEncodable(Exception):
    """Raised by an `encode_as_*` method that cannot handle the given object."""
|
||
|
|
||
|
|
||
|
def iso_to_plotly_time_string(iso_string):
    """
    Strip a zero UTC offset and replace the 'T' delimiter with ' ' (ws).

    A trailing midnight time ("T00:00:00") is dropped entirely so that only
    the date part remains.

    :param (str) iso_string: An ISO-8601 formatted date/time string.
    :return (str): The string with any "-00:00" / "+00:00" removed and the
        'T' delimiter replaced (or the midnight time removed).
    :raises Exception: If the part of the string before the first '+' is
        exactly "00:00".
    """
    # make sure we don't send timezone info to plotly
    # NOTE(review): this guard used to carry a second clause,
    # `iso_string.split("-")[:3] == "00:00"`, which compared a list with a
    # string and was therefore always False; it has been removed without
    # changing behaviour. The remaining check only fires for inputs such as
    # "00:00" or "00:00+01:00", so non-zero offsets (e.g. "+02:00") still
    # pass through untouched. Confirm the intended rule before tightening.
    if iso_string.split("+")[0] == "00:00":
        raise Exception(
            "Plotly won't accept timestrings with timezone info.\n"
            "All timestrings are assumed to be in UTC."
        )

    iso_string = iso_string.replace("-00:00", "").replace("+00:00", "")

    if iso_string.endswith("T00:00:00"):
        return iso_string.replace("T00:00:00", "")
    else:
        return iso_string.replace("T", " ")
|
||
|
|
||
|
|
||
|
def template_doc(**names):
    """
    Build a decorator that fills `{placeholders}` in a function's docstring.

    The docstring is passed through `str.format(**names)`. Functions without
    a docstring are returned untouched, and nothing is substituted when
    running on Python 3.2.

    :param names: Keyword values substituted into the docstring.
    :return (callable): A decorator returning the same function object.
    """

    def _decorator(func):
        running_py32 = sys.version_info[:2] == (3, 2)
        if not running_py32 and func.__doc__ is not None:
            func.__doc__ = func.__doc__.format(**names)
        return func

    return _decorator
|
||
|
|
||
|
|
||
|
def _natural_sort_strings(vals, reverse=False):
    """
    Sort strings so that embedded digit runs compare numerically.

    Each string is split into alternating text and digit chunks; chunks that
    parse as integers are compared as integers (so "a2" sorts before "a10").

    :param (iterable) vals: The strings to sort.
    :param (bool) reverse: Sort in descending order when True.
    :return (list): A new sorted list.
    """

    def _int_or_text(chunk):
        try:
            return int(chunk)
        except ValueError:
            # not an int
            return chunk

    def key(v):
        return tuple(_int_or_text(chunk) for chunk in re.split(r"(\d+)", v))

    return sorted(vals, key=key, reverse=reverse)
|
||
|
|
||
|
|
||
|
def _get_int_type():
    """
    Return the tuple of types to treat as integers in `isinstance` checks.

    `np.integer` is included when `get_module("numpy", should_load=False)`
    returns a module; otherwise only the builtin `int` is returned.
    """
    np = get_module("numpy", should_load=False)
    return (int, np.integer) if np else (int,)
|
||
|
|
||
|
|
||
|
def split_multichar(ss, chars):
    """
    Split all the strings in ss at any of the characters in chars.
    Example:

        >>> ss = ["a.string[0].with_separators"]
        >>> chars = list(".[]_")
        >>> split_multichar(ss, chars)
        ['a', 'string', '0', '', 'with', 'separators']

    `chars` is not modified by this call.

    :param (list) ss: A list of strings.
    :param (list) chars: The separators to split on. Any iterable of
        separators is accepted (a list, a tuple or a plain string).
    :return (list): The split pieces. `ss` itself is returned when `chars`
        is empty; an empty list is returned when `ss` is empty.
    """
    # Work on a reversed copy: separators are applied last-to-first (the
    # order `pop()` would have used) without emptying the caller's list.
    for c in list(chars)[::-1]:
        # A nested comprehension flattens the per-string splits and, unlike
        # a reduce without an initial value, also copes with an empty `ss`.
        ss = [piece for s in ss for piece in s.split(c)]
    return ss
|
||
|
|
||
|
|
||
|
def split_string_positions(ss):
    """
    Given a list of strings split using split_multichar, return a list of
    integers representing the indices of the first character of every string in
    the original string.
    Example:

        >>> ss = ["a.string[0].with_separators"]
        >>> chars = list(".[]_")
        >>> ss_split = split_multichar(ss, chars)
        >>> ss_split
        ['a', 'string', '0', '', 'with', 'separators']
        >>> split_string_positions(ss_split)
        [0, 2, 9, 11, 12, 17]

    :param (list) ss: A list of strings.
    :return (list): One start index per string; empty for empty input.
    """
    positions = []
    start = 0
    for piece in ss:
        positions.append(start)
        # The next piece starts after this one plus the single separator
        # character that was removed when the original string was split.
        start += len(piece) + 1
    return positions
|
||
|
|
||
|
|
||
|
def display_string_positions(p, i=None, offset=0, length=1, char="^", trim=True):
    """
    Return a string that is whitespace except at p[i] which is replaced with char.
    If i is None then all the indices of the string in p are replaced with char.

    Example:

        >>> ss = ["a.string[0].with_separators"]
        >>> chars = list(".[]_")
        >>> ss_split = split_multichar(ss, chars)
        >>> ss_split
        ['a', 'string', '0', '', 'with', 'separators']
        >>> ss_pos = split_string_positions(ss_split)
        >>> ss[0]
        'a.string[0].with_separators'
        >>> display_string_positions(ss_pos,4)
        '            ^'
        >>> display_string_positions(ss_pos,4,offset=1,length=3,char="~",trim=False)
        '             ~~~      '
        >>> display_string_positions(ss_pos)
        '^ ^      ^ ^^    ^'

    :param (list) p: A list of integers.
    :param (integer|None) i: Optional index of p to display.
    :param (integer) offset: Allows adding a number of spaces to the replacement.
    :param (integer) length: Allows adding a replacement that is the char
                             repeated length times.
    :param (str) char: allows customizing the replacement character.
    :param (boolean) trim: trims the remaining whitespace if True.
    :return (str): The marker string; "" when p is empty and i is None.
    :raises IndexError: If i is not a valid index into p.
    """
    if i is None:
        if len(p) == 0:
            # Nothing to mark; max() below would fail on an empty sequence.
            return ""
        starts = p
    else:
        starts = [p[i]]

    s = [" "] * (max(p) + 1 + offset + length)
    maxaddr = None
    for start in starts:
        for step in range(length):
            addr = start + offset + step
            s[addr] = char
            # Track the largest index written, not merely the last one, so
            # trimming never cuts off markers when p is not sorted.
            if maxaddr is None or addr > maxaddr:
                maxaddr = addr
    if maxaddr is None:
        # No marker was written (length == 0).
        maxaddr = 0

    ret = "".join(s)
    if trim:
        ret = ret[: maxaddr + 1]
    return ret
|
||
|
|
||
|
|
||
|
def chomp_empty_strings(strings, c, reverse=False):
    """
    Given a list of strings, some of which are the empty string "", replace the
    empty strings with c and combine them with the closest non-empty string on
    the left or "" if it is the first string.
    Examples:
    for c="_"
    ['hey', '', 'why', '', '', 'whoa', '', ''] -> ['hey_', 'why__', 'whoa__']
    ['', 'hi', '', "I'm", 'bob', '', ''] -> ['_', 'hi_', "I'm", 'bob__']
    ['hi', "i'm", 'a', 'good', 'string'] -> ['hi', "i'm", 'a', 'good', 'string']
    Some special cases are:
    [] -> []
    [''] -> ['']
    ['', ''] -> ['_']
    ['', '', '', ''] -> ['___']
    If reverse is true, empty strings are combined with closest non-empty string
    on the right or "" if it is the last string.

    :param (list) strings: The strings to process.
    :param (str) c: The text substituted for each empty string.
    :param (bool) reverse: Attach to the right-hand neighbour instead.
    :return (list): The combined strings (the input itself when it is empty).
    """

    def _rev(items):
        # Reverse both the list order and every string in it.
        return [s[::-1] for s in items][::-1]

    if reverse:
        # Solve the mirrored problem and mirror the answer back. `c` must be
        # mirrored as well, otherwise a multi-character `c` would come out
        # reversed in the final result.
        return _rev(chomp_empty_strings(_rev(strings), c[::-1]))
    if not len(strings):
        return strings
    if sum(map(len, strings)) == 0:
        # Only empty strings: n of them collapse into n - 1 copies of c.
        return [c * (len(strings) - 1)]

    # Start with a "" sentinel so leading empty strings have something to
    # attach to; it is dropped again below if it stays empty.
    merged = [""]
    for item in strings:
        if len(item) == 0:
            merged[-1] += c
        else:
            merged.append(item)
    return [item for item in merged if item]
|
||
|
|
||
|
|
||
|
# taken from
# https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
def levenshtein(s1, s2):
    """
    Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.
    Only two rows of the dynamic-programming table are kept at a time.

    :param s1: First sequence (typically a str).
    :param s2: Second sequence (typically a str).
    :return (int): The edit distance.
    """
    # The distance is symmetric; keep the longer sequence on the outer loop
    # so the rows are sized by the shorter one.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    if not len(shorter):
        return len(longer)

    prev_row = list(range(len(shorter) + 1))
    for row_no, ch_long in enumerate(longer, start=1):
        cur_row = [row_no]
        for col, ch_short in enumerate(shorter):
            # col + 1 instead of col since both rows are one element longer
            # than the shorter sequence
            cost_insert = prev_row[col + 1] + 1
            cost_delete = cur_row[col] + 1
            cost_substitute = prev_row[col] + (ch_long != ch_short)
            cur_row.append(min(cost_insert, cost_delete, cost_substitute))
        prev_row = cur_row
    return prev_row[-1]
|
||
|
|
||
|
|
||
|
def find_closest_string(string, strings):
    """
    Return the element of `strings` with the smallest edit distance to `string`.

    Candidates with equal distance are ordered lexicographically, so the
    result does not depend on the order of `strings`.

    :param (str) string: The reference string.
    :param (iterable) strings: The candidates to choose from.
    :return (str): The closest candidate.
    :raises IndexError: If `strings` is empty.
    """
    # Rank by (levenshtein distance, candidate) so ties are broken by the
    # candidate text itself.
    ranked = sorted(
        strings,
        key=lambda candidate: (levenshtein(candidate, string), candidate),
    )
    return ranked[0]
|