# Source code for pyconversations.message.base

import re
from abc import ABC
from abc import abstractmethod
from datetime import datetime

from ..ld import LangidLangDetect
from ..tokenizers import DefaultTokenizer
from ..tokenizers import LambdaTokenizer
from ..tokenizers import NLTKTokenizer
from ..tokenizers import PartitionTokenizer

# Language detection module; do not initialize unless asked for!
DETECTOR = None


def get_detector():
    """
    Return the shared language detector, creating it on first use.

    The instance is stored in the module-level ``DETECTOR`` variable so that
    later calls reuse the same object instead of building a new one.

    Returns
    -------
    LangidLangDetect
        The module-wide detector instance.
    """
    global DETECTOR

    # Fast path: a detector was already built by an earlier call.
    if DETECTOR is not None:
        return DETECTOR

    DETECTOR = LangidLangDetect()
    return DETECTOR


def get_tokenizer(key):
    """
    Build the tokenizer registered under ``key``.

    Only the requested tokenizer class is instantiated; the other
    registered classes are left untouched.

    Parameters
    ----------
    key : str
        Name of the tokenizer: 'default', 'NLTK' or 'partitioner'.

    Returns
    -------
    tokenizer
        A new instance of the selected tokenizer class.

    Raises
    ------
    KeyError
        If ``key`` is not one of the registered names.
    """
    # Map names to classes (not instances) so that selecting one tokenizer
    # does not construct the others as a side effect.
    registry = {
        'default':     DefaultTokenizer,
        'NLTK':        NLTKTokenizer,
        'partitioner': PartitionTokenizer,
    }
    return registry[key]()


class UniMessage(ABC):
    """
    The Universal Message class.

    This is designed to be the abstract, baseline object
    that all social media posts / conversation turns
    inherit from.
    The only mandatory field is the uid, a unique field.

    Concrete subclasses must implement the static methods
    ``parse_raw`` and ``parse_datestr``.
    """

    MENTION_REGEX = None
    CLASS_STR = 'UniMessage'

    def __init__(self, uid,
                 text='', author=None,
                 created_at=None, reply_to=None, platform=None, lang=None, tags=None,
                 lang_detect=False, tokenizer='partitioner'):
        """
        UniMessage base class initializer

        Parameters
        ----------
        uid : Hashable
            A unique identifier for the post. The only mandatory field.
        text : str
            Text of the message
        author : Hashable
            The author or some identifier thereof
        created_at : datetime.datetime
            The time of creation
        reply_to : set
            A set of UIDs of the posts this message replies to
        platform : str
            The name of the platform, service, etc. where this message was generated
        lang : str
            The language identifier for the language the text of this post is written (or detected to be written) in
        tags : set
            The set of tagged properties
        lang_detect : bool
            Whether or not language detection should be activated when updating post text
        tokenizer : str or lambda(str -> list(str))
            Which tokenizer to use (Default: partitioner)

        Raises
        ------
        ValueError
            If ``tokenizer`` is neither a callable nor a string.
        """
        # a unique identifier
        self._uid = uid

        # the text of the post
        self._text = text

        # the username/name of the author
        self._author = author

        # created datetime object (stored as given; no conversion happens here)
        self._created_at = created_at

        # collection of IDs this post was generated in reply to
        # (always copied into a fresh set so the caller's iterable is not shared)
        self._reply_to = set(reply_to) if reply_to else set()

        # any special tags or identifiers associated with this message
        self._tags = set(tags) if tags else set()

        # platform name
        self._platform = platform

        # language; detection only runs when lang_detect is set and no usable
        # language was supplied
        self._lang = lang
        self._lang_detect = lang_detect
        self._detect_language()

        # tokenizer specification, resolved into a tokenizer object below
        self._tok = tokenizer
        self._init_tokenizer()

    @property
    def uid(self):
        """
        The unique identifier of this object.

        Returns
        -------
        UID
            Unique identifier for this message.
        """
        return self._uid

    @property
    def text(self):
        """
        The text associated with this message.

        Returns
        -------
        str
            Message text
        """
        return self._text

    @text.setter
    def text(self, t):
        """
        Updates the text field of this message.

        The stored language is cleared and, when language detection is
        enabled, re-detected from the new text.

        Parameters
        ----------
        t : str
            The new text

        Returns
        -------
        None
        """
        self._text = t
        # the previous language refers to the old text, so drop it
        self._lang = None
        self._detect_language()

    @property
    def created_at(self):
        """
        Returns the datetime associated with this message.

        Returns
        -------
        datetime.datetime
            Time of creation of post. Could be None if not available/processed.
        """
        return self._created_at

    @created_at.setter
    def created_at(self, x):
        """
        Updates the timestamp for when this message was created.

        Parameters
        ----------
        x : str, int, float or datetime.datetime
            The new creation time. Strings are converted with
            ``parse_datestr``, numbers are interpreted as POSIX timestamps
            and datetime objects are stored unchanged.

        Returns
        -------
        None

        Raises
        ------
        TypeError
            When setting this property with a value that is not a string,
            a number or a datetime.
        """
        if isinstance(x, datetime):
            self._created_at = x
        elif isinstance(x, str):
            self._created_at = self.parse_datestr(x)
        elif isinstance(x, (int, float)) and not isinstance(x, bool):
            # bool is a subclass of int but is never a meaningful timestamp
            self._created_at = datetime.fromtimestamp(x)
        else:
            raise TypeError(f'Unrecognized created_at conversion: {type(x)} --> {x}')

    @property
    def author(self):
        """
        Returns the author of this message.

        Returns
        -------
        str
            Author name/username
        """
        return self._author

    @author.setter
    def author(self, a):
        """
        Updates the author of this message.

        Parameters
        ----------
        a : str
            The new author

        Returns
        -------
        None
        """
        self._author = a

    @property
    def reply_to(self):
        """
        Returns the unique identifiers of the messages that are replied to by this message.

        Returns
        -------
        set(UID)
            The set of UIDs of the posts this message replies to
        """
        return self._reply_to

    @property
    def tags(self):
        """
        Returns the tags associated with this message.

        Returns
        -------
        set(str)
            Set of string tags associated with this message
        """
        return self._tags

    @property
    def platform(self):
        """
        The platform this message was created on

        Returns
        -------
        str
            Platform name
        """
        return self._platform

    @platform.setter
    def platform(self, p):
        """
        Updates the platform this message is from.

        Parameters
        ----------
        p : str
            The platform name

        Returns
        -------
        None
        """
        self._platform = p

    @property
    def lang(self):
        """
        Returns the language this post was written in

        Returns
        -------
        str
            Language code of the message text
        """
        return self._lang

    @lang.setter
    def lang(self, lang):
        """
        Updates the language this post was written in

        Parameters
        ----------
        lang : str
            The language associated with this post

        Returns
        -------
        None
        """
        self._lang = lang

    def __hash__(self):
        """
        Hash a message by its unique identifier.

        Returns
        -------
        int
            Hash of the uid
        """
        # NOTE(review): no __eq__ is defined on this class, so equality stays
        # identity-based even though the hash is derived from the uid.
        return hash(self._uid)

    def __repr__(self):
        """
        Short debugging representation: platform, author, creation time,
        the first 50 characters of the text and the tags.
        """
        # tolerate a missing text and non-string tags instead of raising
        snippet = (self._text or '')[:50]
        tag_str = ','.join(map(str, self._tags))
        return f'{self.CLASS_STR}({self._platform}::{self._author}::{self._created_at}::{snippet}::tags={tag_str})'

    def __ior__(self, other):
        """
        Merge the fields of another message into this one (``self |= other``).

        The longer text is kept, missing author/language values are filled
        in from ``other``, the earlier creation time wins, and the reply and
        tag sets are unioned.

        Parameters
        ----------
        other : UniMessage
            The message whose fields are merged into this one

        Returns
        -------
        UniMessage
            This (updated) message
        """
        # Setting this to always take the larger text chunk...
        # (written to _text directly, so the language is not reset here)
        if len(self._text or '') < len(other.text or ''):
            self._text = other.text

        if self._author is None:
            self._author = other.author

        # adopt the other timestamp when ours is missing or theirs is earlier
        theirs = other.created_at
        if theirs is not None and (self._created_at is None or theirs < self._created_at):
            self._created_at = theirs

        if self._lang is None:
            self._lang = other.lang

        self._reply_to |= other.reply_to
        self._tags |= other.tags

        return self

    def _init_tokenizer(self):
        """
        Sub-selects the tokenizer to use in this class.

        A callable is wrapped in a ``LambdaTokenizer``; a string is looked
        up through ``get_tokenizer``.

        Raises
        ------
        ValueError
            If the tokenizer specification is neither callable nor a string.
        """
        if callable(self._tok):
            self._tok = LambdaTokenizer(self._tok)
        elif isinstance(self._tok, str):
            # load from dictionary of available choices
            self._tok = get_tokenizer(self._tok)
        else:
            raise ValueError(f'UniMessage._init_tokenizer. Unrecognized value: {self._tok}')

    def _detect_language(self):
        """
        Classifies the text of the post and updates the language field, if asked for.

        Detection only runs when it is enabled, the text is non-empty and the
        current language is unset or 'und'.
        """
        if (not self._lang or self.lang == 'und') and self._lang_detect and self._text:
            res = get_detector().get(text=self.text)
            # res[0] is used as the language and res[1] as its score; results
            # scoring below 0.5 are recorded as 'und'
            self.lang = res[0] if res[1] >= 0.5 else 'und'

    @classmethod
    def from_json(cls, data):
        """
        Given an exported JSON object for a Universal Message,
        this function loads the saved data into its fields

        The input mapping is not modified.

        Parameters
        ----------
        data : JSON/dict
            The raw message JSON

        Returns
        -------
        Message class
            Created inherited UniMessage object
        """
        # work on a shallow copy so the caller's dict keeps its raw timestamp
        fields = dict(data)
        stamp = fields.get('created_at')
        # compare against None so that the epoch timestamp 0 survives a round trip
        fields['created_at'] = datetime.fromtimestamp(stamp) if stamp is not None else None
        return cls(**fields)

    @staticmethod
    @abstractmethod
    def parse_raw(raw, lang_detect=False):
        """
        Abstract static method that must be implemented by all non-abstract child classes.
        Concrete implementations should specify how to parse the raw data into this object.

        Parameters
        ----------
        raw : JSON/dict
            The raw data to be pre-processed.
        lang_detect : bool
            A boolean which specifies whether language detection should be activated. (Default: False)
        """
        raise NotImplementedError

    @staticmethod
    @abstractmethod
    def parse_datestr(x):
        """
        Abstract static method that specifies how to convert the native datetime string
        into a Python datetime object.

        Parameters
        ----------
        x : str
            The raw datetime string
        """
        raise NotImplementedError

    def add_reply_to(self, tid):
        """
        Adds a new UID that this message is replying to.

        Parameters
        ----------
        tid : UID
            The UID to be added

        Returns
        -------
        None
        """
        self._reply_to.add(tid)

    def remove_reply_to(self, tid):
        """
        Removes a UID from the set this message is replying to.

        Parameters
        ----------
        tid : UID
            The UID to be removed

        Returns
        -------
        None

        Raises
        ------
        KeyError
            If ``tid`` is not in the reply set.
        """
        self._reply_to.remove(tid)

    def add_tag(self, tag):
        """
        Adds a new tag to this message.

        Parameters
        ----------
        tag : str
            The tag to be added

        Returns
        -------
        None
        """
        self._tags.add(tag)

    def remove_tag(self, tag):
        """
        Removes a tag from this message.

        Parameters
        ----------
        tag : str
            The tag to remove

        Returns
        -------
        None

        Raises
        ------
        KeyError
            If ``tag`` is not present on this message.
        """
        self._tags.remove(tag)

    def to_json(self):
        """
        Function for exporting a Universal Post into a JSON object for storage and later use

        Returns
        -------
        JSON/dict
            The JSON formatted UniMessage for disk storage
        """
        stamp = self.created_at.timestamp() if self.created_at is not None else None
        return {
            'uid':        self._uid,
            'text':       self.text,
            'author':     self.author,
            'created_at': stamp,
            # sets are exported as lists
            'reply_to':   list(self.reply_to),
            'platform':   self.platform,
            'tags':       list(self._tags),
            'lang':       self._lang
        }

    def get_mentions(self):
        """
        By default, this will simply return the author
        of the post (if available) for appropriate anonymization

        Returns
        -------
        set(str)
            The mentions detected in this message
        """
        if self.author:
            return {self.author}

        return set()

    def redact(self, redact_map):
        """
        Given a set of terms, this function will properly redact
        all instances of those terms.
        This function is mainly to use for redacting usernames
        or user mentions, so as to protect user privacy.

        Terms are matched literally (not as regular expressions) and the
        replacements are inserted verbatim. Empty terms are ignored.

        Parameters
        ----------
        redact_map : dict(str, str)
            The map of terms and what they should be replaced with

        Returns
        -------
        None
        """
        if self.text:
            redacted = self.text
            for term, replacement in redact_map.items():
                # literal replacement: terms such as 'a.b' or '(x)' must not be
                # interpreted as regex patterns
                if term and term in redacted:
                    redacted = redacted.replace(term, replacement)

            # assign once, and only on change, so the text setter (which resets
            # the language) runs at most one time
            if redacted != self.text:
                self.text = redacted

        # Change the author's name if they're in our redaction map
        if self.author in redact_map:
            self.author = redact_map[self.author]

    @property
    def tokens(self):
        """
        Tokenizes the text of this message

        Returns
        -------
        list(str)
            The tokenized text
        """
        return self._tok.tokenize(self.text)