Source code for pyconversations.message.base

import re
from abc import ABC
from abc import abstractmethod
from datetime import datetime

from ..ld import LangidLangDetect
from ..tokenizers import DefaultTokenizer
from ..tokenizers import LambdaTokenizer
from ..tokenizers import NLTKTokenizer
from ..tokenizers import PartitionTokenizer

# Language detection module; do not initialize unless asked for!
DETECTOR = None


def get_detector():
    """
    Returns the shared language detector, building it on first use.

    The instance is stored in the module-level ``DETECTOR`` variable, so the
    ``LangidLangDetect`` constructor runs at most once per process and only
    when a caller actually asks for a detector.

    Returns
    -------
    LangidLangDetect
        The cached detector instance
    """
    global DETECTOR

    # Fast path: a detector was already created by an earlier call.
    if DETECTOR is not None:
        return DETECTOR

    DETECTOR = LangidLangDetect()
    return DETECTOR


def get_tokenizer(key):
    """
    Builds the tokenizer registered under the given name.

    Only the requested tokenizer class is instantiated; the other registered
    tokenizers are never constructed by this call.

    Parameters
    ----------
    key : str
        Name of the tokenizer: one of 'default', 'NLTK' or 'partitioner'

    Returns
    -------
    object
        A new instance of the selected tokenizer class

    Raises
    ------
    KeyError
        When `key` is not one of the registered tokenizer names.
    """
    # Map names to classes (not instances) so that looking one up does not
    # pay for constructing every other tokenizer as well.
    registry = {
        'default':     DefaultTokenizer,
        'NLTK':        NLTKTokenizer,
        'partitioner': PartitionTokenizer,
    }
    return registry[key]()


class UniMessage(ABC):
    """
    The Universal Message class.

    This is designed to be the abstract, baseline object that all social media
    posts / conversation turns inherit from. The only mandatory field is the
    uid, a unique field.

    Concrete subclasses must implement the static methods `parse_raw` and
    `parse_datestr`.
    """

    # Not used by the base class itself; presumably overridden by
    # platform-specific subclasses -- TODO confirm against subclasses.
    MENTION_REGEX = None

    # Label used as the prefix of __repr__.
    CLASS_STR = 'UniMessage'

    def __init__(self, uid, text='', author=None, created_at=None, reply_to=None, platform=None, lang=None,
                 tags=None, lang_detect=False, tokenizer='partitioner'):
        """
        UniMessage base class initializer

        Parameters
        ----------
        uid : Hashable
            A unique identifier for the post. The only mandatory field.
        text : str
            Text of the message
        author : Hashable
            The author or some identifier thereof
        created_at : datetime.datetime
            The time of creation
        reply_to : set
            A set of UIDs of the posts this message replies to
        platform : str
            The name of the platform, service, etc. where this message was generated
        lang : str
            The language identifier for the language the text of this post is written
            (or detected to be written) in
        tags : set
            The set of tagged properties
        lang_detect : bool
            Whether or not language detection should be activated when updating post text
        tokenizer : str or lambda(str -> list(str))
            Which tokenizer to use (Default: partitioner)

        Raises
        ------
        ValueError
            When `tokenizer` is neither a callable nor a string.
        KeyError
            When `tokenizer` is a string that names no registered tokenizer.
        """
        # a unique identifier
        self._uid = uid

        # the text of the post
        self._text = text

        # the username/name of the author
        self._author = author

        # created datetime object (stored as given, no conversion here)
        self._created_at = created_at

        # collection of IDs this post was generated in reply to
        self._reply_to = set() if not reply_to else set(reply_to)

        # any special tags or identifiers associated with this message
        self._tags = set() if not tags else set(tags)

        # platform name
        self._platform = platform

        # language; detection only runs when lang_detect is set and no
        # usable language was supplied
        self._lang = lang
        self._lang_detect = lang_detect
        self._detect_language()

        # tokenizer specification, resolved into a tokenizer object
        self._tok = tokenizer
        self._init_tokenizer()

    @property
    def uid(self):
        """
        The unique identifier of this object.

        Returns
        -------
        UID
            Unique identifier for this message.
        """
        return self._uid

    @property
    def text(self):
        """
        The text associated with this message.

        Returns
        -------
        str
            Message text
        """
        return self._text

    @text.setter
    def text(self, t):
        """
        Updates the text field of this message.

        The stored language is cleared and, if language detection is enabled,
        re-detected from the new text.

        Parameters
        ----------
        t : str
            The new text

        Returns
        -------
        None
        """
        self._text = t
        self._lang = None
        self._detect_language()

    @property
    def created_at(self):
        """
        Returns the datetime associated with this message.

        Returns
        -------
        datetime.datetime
            Time of creation of post. Could be None if not available/processed.
        """
        return self._created_at

    @created_at.setter
    def created_at(self, x):
        """
        Updates the timestamp for when this message was created.

        Parameters
        ----------
        x : datetime.datetime, str, int or float
            The new creation time. A datetime is stored as-is, a string is
            converted with `parse_datestr`, and a number is interpreted as a
            POSIX timestamp via `datetime.fromtimestamp`.

        Returns
        -------
        None

        Raises
        ------
        TypeError
            When setting this property with a value that is not a datetime,
            a string or a number.
        """
        if isinstance(x, datetime):
            self._created_at = x
        elif isinstance(x, str):
            self._created_at = self.parse_datestr(x)
        elif isinstance(x, (int, float)) and not isinstance(x, bool):
            # bool is an int subclass but is never a meaningful timestamp
            self._created_at = datetime.fromtimestamp(x)
        else:
            raise TypeError(f'Unrecognized created_at conversion: {type(x)} --> {x}')

    @property
    def author(self):
        """
        Returns the author of this message.

        Returns
        -------
        str
            Author name/username
        """
        return self._author

    @author.setter
    def author(self, a):
        """
        Updates the author of this message.

        Parameters
        ----------
        a : str
            The new author

        Returns
        -------
        None
        """
        self._author = a

    @property
    def reply_to(self):
        """
        Returns the unique identifiers of the messages that are replied to by this message.

        Returns
        -------
        set(UID)
            The set of UIDs of the posts this message replies to
        """
        return self._reply_to

    @property
    def tags(self):
        """
        Returns the tags associated with this message.

        Returns
        -------
        set(str)
            Set of string tags associated with this message
        """
        return self._tags

    @property
    def platform(self):
        """
        The platform this message was created on

        Returns
        -------
        str
            Platform name
        """
        return self._platform

    @platform.setter
    def platform(self, p):
        """
        Updates the platform this message is from.

        Parameters
        ----------
        p : str
            The platform name

        Returns
        -------
        None
        """
        self._platform = p

    @property
    def lang(self):
        """
        Returns the language this post was written in

        Returns
        -------
        str
            Language code of the message text
        """
        return self._lang

    @lang.setter
    def lang(self, lang):
        """
        Updates the language this post was written in

        Parameters
        ----------
        lang : str
            The language associated with this post

        Returns
        -------
        None
        """
        self._lang = lang

    @property
    def tokens(self):
        """
        Tokenizes the text of this message

        Returns
        -------
        list(str)
            The tokenized text, as produced by the configured tokenizer
        """
        return self._tok.tokenize(self.text)

    # NOTE: no __eq__ is defined in this class, so equality remains
    # identity-based even though the hash is derived from the uid.
    def __hash__(self):
        return hash(self._uid)

    def __repr__(self):
        return f'{self.CLASS_STR}({self._platform}::{self._author}::{self._created_at}::{self._text[:50]}::tags={",".join(self._tags)})'

    def __ior__(self, other):
        """
        Merges the fields of another message into this one (``self |= other``).

        Parameters
        ----------
        other : UniMessage
            The message whose fields are merged in

        Returns
        -------
        UniMessage
            This message, after merging
        """
        # Setting this to always take the larger text chunk...
        if len(self._text) < len(other.text):
            self._text = other.text

        if self._author is None:
            self._author = other.author

        # Fill in a missing timestamp, otherwise keep the earlier of the two
        if self._created_at is None:
            self._created_at = other.created_at
        elif self._created_at and other.created_at and other.created_at < self._created_at:
            self._created_at = other.created_at

        if self._lang is None:
            self._lang = other.lang

        self._reply_to |= other.reply_to
        self._tags |= other.tags

        return self

    def _init_tokenizer(self):
        """
        Sub-selects the tokenizer to use in this class.

        A callable is wrapped in a LambdaTokenizer; a string is looked up via
        `get_tokenizer`.

        Raises
        ------
        ValueError
            When the tokenizer specification is neither callable nor a string.
        """
        if callable(self._tok):
            self._tok = LambdaTokenizer(self._tok)
        elif isinstance(self._tok, str):
            # load from dictionary of available choices
            self._tok = get_tokenizer(self._tok)
        else:
            raise ValueError(f'UniMessage._init_tokenizer. Unrecognized value: {self._tok}')

    def _detect_language(self):
        """
        Classifies the text of the post and updates the language field, if asked for.

        Detection only runs when it is enabled, the text is non-empty and the
        current language is unset or 'und'. A result whose score is below 0.5
        is recorded as 'und'.
        """
        if (not self._lang or self.lang == 'und') and self._lang_detect and self._text:
            res = get_detector().get(text=self.text)
            self.lang = res[0] if res[1] >= 0.5 else 'und'

    @classmethod
    def from_json(cls, data):
        """
        Given an exported JSON object for a Universal Message, this function loads the saved
        data into its fields

        The input mapping is not modified. Its entries are passed to the class
        initializer as keyword arguments.

        Parameters
        ----------
        data : JSON/dict
            The raw message JSON. `created_at`, when present and not None, is
            a POSIX timestamp.

        Returns
        -------
        Message class
            Created inherited UniMessage object
        """
        # Work on a copy so the caller's dict keeps its original timestamp
        fields = dict(data)

        # Compare against None explicitly: a timestamp of 0 is a valid value
        stamp = fields.get('created_at')
        fields['created_at'] = datetime.fromtimestamp(stamp) if stamp is not None else None

        return cls(**fields)

    @staticmethod
    @abstractmethod
    def parse_raw(raw, lang_detect=False):
        """
        Abstract static method that must be implemented by all non-abstract child classes.
        Concrete implementations should specify how to parse the raw data into this object.

        Parameters
        ----------
        raw : JSON/dict
            The raw data to be pre-processed.
        lang_detect : bool
            A boolean which specifies whether language detection should be activated.
            (Default: False)
        """
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def parse_datestr(x):
        """
        Abstract static method that specifies how to convert the native datetime string
        into a Python datetime object.

        Parameters
        ----------
        x : str
            The raw datetime string
        """
        raise NotImplementedError

    def add_reply_to(self, tid):
        """
        Adds a new UID that this message is replying to.

        Parameters
        ----------
        tid : UID
            The UID to be added

        Returns
        -------
        None
        """
        self._reply_to.add(tid)

    def remove_reply_to(self, tid):
        """
        Removes a UID from the set this message is replying to.

        Parameters
        ----------
        tid : UID
            The UID to be removed

        Raises
        ------
        KeyError
            When `tid` is not in the reply set.
        """
        self._reply_to.remove(tid)

    def add_tag(self, tag):
        """
        Adds a new tag to this message.

        Parameters
        ----------
        tag : str
            The tag to be added

        Returns
        -------
        None
        """
        self._tags.add(tag)

    def remove_tag(self, tag):
        """
        Removes a tag from this message.

        Parameters
        ----------
        tag : str
            The tag to remove

        Returns
        -------
        None

        Raises
        ------
        KeyError
            When `tag` is not attached to this message.
        """
        self._tags.remove(tag)

    def to_json(self):
        """
        Function for exporting a Universal Post into a JSON object for storage and later use

        Returns
        -------
        JSON/dict
            The JSON formatted UniMessage for disk storage. `created_at` is
            exported as a POSIX timestamp (or None), sets are exported as lists.
        """
        return {
            'uid':        self._uid,
            'text':       self.text,
            'author':     self.author,
            'created_at': self.created_at.timestamp() if self.created_at else None,
            'reply_to':   list(self.reply_to),
            'platform':   self.platform,
            'tags':       list(self._tags),
            'lang':       self._lang,
        }

    def get_mentions(self):
        """
        By default, this will simply return the author of the post (if available)
        for appropriate anonymization

        Returns
        -------
        set(str)
            The mentions detected in this message
        """
        if self.author:
            return {self.author}

        return set()

    def redact(self, redact_map):
        """
        Given a set of terms, this function will properly redact all instances of those terms.

        This function is mainly to use for redacting usernames or user mentions,
        so as to protect user privacy. Terms and replacements are treated as
        literal text, never as regular expressions.

        Parameters
        ----------
        redact_map : dict(str, str)
            The map of terms and what they should be replaced with

        Returns
        -------
        None
        """
        redacted = self.text
        if redacted:
            matched = False
            for term, replacement in redact_map.items():
                # Literal replacement: usernames may contain regex
                # metacharacters ('.', '+', '(' ...) that must not be
                # interpreted. Empty terms are skipped because they would
                # match between every character.
                if term and term in redacted:
                    redacted = redacted.replace(term, replacement)
                    matched = True

            # Go through the text setter once so the language is reset
            # (and re-detected, when enabled) for the redacted text.
            if matched:
                self.text = redacted

        # Change the author's name if they're in our redaction map
        if self.author in redact_map:
            self.author = redact_map[self.author]