Source code for pyconversations.convo

from collections import defaultdict

import networkx as nx

from .message import get_constructor_by_platform


class Conversation:
    """
    A container class for managing collections of UniMessage (post) objects.

    Posts are stored in a dictionary keyed by UID. Relationship queries
    (parents, children, ancestors, ...) are memoized per UID and the memo is
    discarded whenever a post is added or removed through `add_post` or
    `remove_post`. Mutating the dictionary returned by `posts` directly
    bypasses that invalidation.
    """

    def __init__(self, posts=None, convo_id=None):
        """
        Constructor for Conversation object.

        Parameters
        ---------
        posts
            An optional dictionary of messages/posts; keys should be unique IDs.
            A non-empty dictionary is stored by reference (not copied).
        convo_id
            An optional identifier for this conversation. When falsy, the
            `convo_id` property derives one from the source posts.
        """
        self._posts = posts if posts else {}  # uid -> post object
        self._convo_id = convo_id

        # relation name -> {uid -> set of related UIDs}; memo for get_* queries
        self._relation_map = defaultdict(dict)
        # lazily (re)built by the `authors` property whenever it is empty
        self._author_set = set()

    def __add__(self, other):
        """
        Defines the addition operation over Conversation objects.
        Returns a new Conversation; neither operand's container is modified.

        The post objects themselves are shared with the operands (not copied).
        When both operands hold a post with the same UID, the two are combined
        through `add_post` (i.e., with the post's `|=` operator).

        Parameters
        ---------
        other : Conversation
            Another conversation to be added to this one.

        Returns
        -------
        Conversation
            The combination of this conversation and the conversation in `other`
        """
        # f-string so that non-string conversation IDs can be combined as well
        convo = Conversation(convo_id=f'{self.convo_id}++{other.convo_id}')
        for source in (other, self):
            for post in source.posts.values():
                convo.add_post(post)
        return convo

    @property
    def posts(self):
        """
        Returns a dictionary of posts, keyed by their UIDs.

        Returns
        -------
        dict(UID, UniMessage)
            The dictionary of posts contained in this Conversation object
        """
        return self._posts

    @property
    def convo_id(self):
        """
        The conversation identifier

        Returns
        -------
        Any (or str)
            Returns a conversation identifier. Creates ones from sources if unspecified.
        """
        if self._convo_id:
            return self._convo_id
        return 'CONV_' + '-'.join(map(str, sorted(self.get_sources())))

    @property
    def authors(self):
        """
        The authors of the posts in this conversation.

        Returns
        -------
        set
            The set of `author` values of the contained posts
        """
        if not self._author_set:
            self._author_set = {post.author for post in self._posts.values()}

        return self._author_set

    def add_post(self, post):
        """
        Adds a post to the conversational container.
        If a (truthy) post with the same UID is already present, the two are
        merged with the post's `|=` operator instead of being replaced.

        Parameters
        ---------
        post : UniMessage, or derivative concrete class
            The post object to be added.

        Returns
        -------
        None
        """
        if post.uid in self._posts and self._posts[post.uid]:
            self._posts[post.uid] |= post
        else:
            self._posts[post.uid] = post

        self._author_set.add(post.author)
        # memoized relations may no longer be valid once the post set changes
        self._relation_map.clear()

    def remove_post(self, uid):
        """
        Deletes a post from the conversational container using its UID.

        Parameters
        ---------
        uid : Hashable
            Unique identifier for the post to delete.

        Returns
        -------
        None

        Raises
        ------
        KeyError
            When `uid` is not in the Conversation
        """
        del self._posts[uid]
        # emptied so that `authors` rebuilds the set on next access
        self._author_set = set()
        # memoized relations may still name the removed post
        self._relation_map.clear()

    def as_graph(self):
        """
        Constructs (and returns) a networkx Graph object
        from the contained posts and edges.
        Replies to posts that are not in this Conversation produce no edge.

        Returns
        -------
        networkx.Graph
            The networkx graph associated with this Conversation
        """
        graph = nx.Graph()

        # add posts as nodes
        graph.add_nodes_from(self._posts)

        # add reply connections as edges
        for uid, post in self._posts.items():
            for rid in post.reply_to:
                if rid in self._posts:
                    graph.add_edge(uid, rid)

        return graph

    def segment(self):
        """
        Segments a conversation into disjoint (i.e., not connected by any replies) sub-conversations.
        If a single conversation is contained in this object,
        this function will return a list with a single element holding all of its posts.
        The sub-conversations are created without an explicit conversation ID.

        Returns
        -------
        list(Conversation)
            A list of sub-conversations
        """
        return [
            Conversation(posts={uid: self._posts[uid] for uid in node_set})
            for node_set in nx.connected_components(self.as_graph())
        ]

    def to_json(self):
        """
        Returns a JSON representation of this object.

        Returns
        -------
        list(JSON/dict)
            The list of the JSON representations of the contained posts
        """
        return [post.to_json() for post in self._posts.values()]

    @staticmethod
    def from_json(raw):
        """
        Converts a JSON representation of a Conversation into a full object.

        Parameters
        ---------
        raw : list(JSON/dict)
            The raw JSON: an iterable of post representations, each with a
            'platform' key that selects the post constructor.

        Returns
        -------
        Conversation
            The conversation read from the raw JSON
        """
        convo = Conversation()
        for pjson in raw:
            convo.add_post(get_constructor_by_platform(pjson['platform']).from_json(pjson))
        return convo

    def get_sources(self):
        """
        Returns the originating (non-reply) posts included in this conversation.
        A post counts as a source when none of the posts it replies to are
        contained in this Conversation.

        Returns
        -------
        set(UID)
            The set of unique IDs of posts that originate conversation (are not replies)
        """
        return {
            uid for uid, post in self._posts.items()
            if not any(rid in self._posts for rid in post.reply_to)
        }

    def filter(self, by_langs=None, min_chars=0, before=None, after=None, by_tags=None, by_platform=None, by_author=None):
        """
        Returns the set of post UIDs that meet the parameterized criteria

        Parameters
        ---------
        by_langs : set(str)
            The desired language codes to be retained. (Default: None)
        min_chars : int
            The minimum number of characters a post should have. (Default: 0)
        before : datetime.datetime
            Exclusive upper bound: only posts created strictly before this are retained.
            Posts without a timestamp are dropped when set. (Default: None)
        after : datetime.datetime
            Exclusive lower bound: only posts created strictly after this are retained.
            Posts without a timestamp are dropped when set. (Default: None)
        by_tags : set(str)
            The required tags; a post must carry all of them. (Default: None)
        by_platform : set(str)
            A set of string names of platforms that should be retained
        by_author : str
            An author

        Returns
        -------
        set(hashable)
            Set of UIDs
        """
        keep = set()
        for uid, post in self._posts.items():
            if by_author is not None and post.author != by_author:
                continue

            if len(post.text) < min_chars:
                continue

            if by_langs and post.lang not in by_langs:
                continue

            # `is not None` (rather than truthiness) so a falsy-but-valid bound still applies
            if before is not None and (post.created_at is None or post.created_at >= before):
                continue
            if after is not None and (post.created_at is None or post.created_at <= after):
                continue

            # every requested tag must be present on the post
            if by_tags and not by_tags <= post.tags:
                continue

            if by_platform and post.platform not in by_platform:
                continue

            keep.add(uid)

        return keep

    def time_order(self):
        """
        Returns a time series of the UIDs of posts within this Conversation.

        Returns
        -------
        list(UID)
            The list of UIDs of the posts in the conversation, in temporal order.
            Empty when the timestamps cannot be compared with each other
            (sorting raised TypeError), e.g. when some are missing.
        """
        try:
            return sorted(self._posts, key=lambda k: self._posts[k].created_at)
        except TypeError:
            return []

    def text_stream(self):
        """
        Returns the text of the Conversation as a single stream.
        If timestamps are available, text will appear in temporal order.

        Returns
        -------
        list(str)
            The text of the conversation, by post, in temporal order (if available)
        """
        # fall back to container order when no temporal order could be established
        order = self.time_order() or self._posts
        return [self._posts[uid].text for uid in order]

    def redact(self, assign_ints=True):
        """
        Redacts user information from the conversation.
        The posts are modified through their own `redact` method.

        Parameters
        ----------
        assign_ints : bool
            If True, assigns a unique integer to each user such the user will be referred to as `USER<d+>`
            Otherwise, all user redactions will become a `USER` token.

        Returns
        -------
        None
        """
        # first pass: build one replacement map over every mention in the conversation
        rd = {}
        for post in self._posts.values():
            for user in post.get_mentions():
                if user not in rd:
                    rd[user] = f'USER{len(rd)}' if assign_ints else 'USER'

        # second pass: apply the same map to every post
        for post in self._posts.values():
            post.redact(rd)

    def _cached_ids(self, relation, uid, compute):
        """Returns the memoized UID set for (`relation`, `uid`), calling `compute()` on a miss."""
        cache = self._relation_map[relation]
        if uid not in cache:
            cache[uid] = compute()
        return cache[uid]

    def _parent_ids(self, uid):
        """Returns the UIDs of the contained posts that post `uid` replies to."""
        return self._cached_ids(
            'parents', uid,
            lambda: {rid for rid in self._posts[uid].reply_to if rid in self._posts},
        )

    def _child_ids(self, uid):
        """Returns the UIDs of the contained posts that reply to `uid`."""
        return self._cached_ids(
            'children', uid,
            lambda: {pid for pid, post in self._posts.items() if uid in post.reply_to},
        )

    def _reachable(self, uid, neighbours):
        """Returns every UID reachable from `uid` via `neighbours`, excluding `uid` itself."""
        found = set()
        # explicit stack instead of recursion: no recursion-depth limit on long
        # reply chains, and the `found` check terminates on reply cycles
        stack = list(neighbours(uid))
        while stack:
            pid = stack.pop()
            if pid == uid or pid in found:
                continue
            found.add(pid)
            stack.extend(neighbours(pid))
        return found

    def _view(self, relation, uid, pids, include_post):
        """Builds the sub-Conversation of `pids`, named after `relation` and `uid`."""
        cx = Conversation(
            posts={pid: self._posts[pid] for pid in pids},
            convo_id=f'{self.convo_id}-{uid}-{relation}',
        )

        if include_post:
            cx.add_post(self._posts[uid])

        return cx

    def get_ancestors(self, uid, include_post=False):
        """
        Returns the ancestor posts/path for post `uid`.
        Reply cycles are tolerated; `uid` itself is never counted as its own ancestor.

        Parameters
        ----------
        uid : Hashable
            The unique identifier of desired post

        include_post : bool
            Whether the post should be included in returned collection. Default: False

        Returns
        -------
        Conversation
            The collection of ancestor posts

        Raises
        ------
        KeyError
            When `uid` is not in the Conversation
        """
        pids = self._cached_ids('ancestors', uid, lambda: self._reachable(uid, self._parent_ids))
        return self._view('ancestors', uid, pids, include_post)

    def get_descendants(self, uid, include_post=False):
        """
        Returns the descendant sub-tree for post `uid`.
        Reply cycles are tolerated; `uid` itself is never counted as its own descendant.

        Parameters
        ----------
        uid : Hashable
            The unique identifier of desired post

        include_post : bool
            Whether the post should be included in returned collection. Default: False

        Returns
        -------
        Conversation
            The collection of descendant posts

        Raises
        ------
        KeyError
            When `include_post` is True and `uid` is not in the Conversation
        """
        pids = self._cached_ids('descendant', uid, lambda: self._reachable(uid, self._child_ids))
        return self._view('descendant', uid, pids, include_post)

    def get_parents(self, uid, include_post=False):
        """
        Returns the parent(s) of a post specified by `uid`.
        Only parents that are contained in this Conversation are returned.

        Parameters
        ----------
        uid : Hashable
            The unique identifier of desired post

        include_post : bool
            Whether the post should be included in returned collection. Default: False

        Returns
        -------
        Conversation
            The collection of parent posts

        Raises
        ------
        KeyError
            When `uid` is not in the Conversation
        """
        return self._view('parents', uid, self._parent_ids(uid), include_post)

    def get_children(self, uid, include_post=False):
        """
        Returns the children of a post specified by `uid`.

        Parameters
        ----------
        uid : Hashable
            The unique identifier of desired post

        include_post : bool
            Whether the post should be included in returned collection. Default: False

        Returns
        -------
        Conversation
            The collection of children posts

        Raises
        ------
        KeyError
            When `include_post` is True and `uid` is not in the Conversation
        """
        return self._view('children', uid, self._child_ids(uid), include_post)

    def get_siblings(self, uid, include_post=False):
        """
        Returns the siblings of a post specified by `uid`.
        Siblings are the child posts of this post's parent posts.

        Parameters
        ----------
        uid : Hashable
            The unique identifier of desired post

        include_post : bool
            Whether the post should be included in returned collection. Default: False

        Returns
        -------
        Conversation
            The collection of sibling posts

        Raises
        ------
        KeyError
            When `uid` is not in the Conversation
        """
        def compute():
            # union of the children of every parent, without the post itself
            found = set()
            for pid in self._parent_ids(uid):
                found |= self._child_ids(pid)
            found.discard(uid)
            return found

        pids = self._cached_ids('siblings', uid, compute)
        return self._view('siblings', uid, pids, include_post)

    def get_before(self, uid, include_post=False):
        """
        Returns the collection of posts in this conversation that were created before the post
        with UID `uid`.
        If the pivot post has no timestamp, no time bound is applied by `filter`.

        Parameters
        ----------
        uid : Hashable
            The UID of the post that is the pivot

        include_post : bool
            Whether the post should be included in returned collection. Default: False

        Returns
        -------
        Conversation
            The collection of posts posted before uid

        Raises
        ------
        KeyError
            When `uid` is not in the Conversation
        """
        pids = self._cached_ids('before', uid, lambda: self.filter(before=self._posts[uid].created_at))
        return self._view('before', uid, pids, include_post)

    def get_after(self, uid, include_post=False):
        """
        Returns the collection of posts in this conversation that were created after the post
        with UID `uid`.
        If the pivot post has no timestamp, no time bound is applied by `filter`.

        Parameters
        ----------
        uid : Hashable
            The UID of the post that is the pivot

        include_post : bool
            Whether the post should be included in returned collection. Default: False

        Returns
        -------
        Conversation
            The collection of posts posted after uid

        Raises
        ------
        KeyError
            When `uid` is not in the Conversation
        """
        pids = self._cached_ids('after', uid, lambda: self.filter(after=self._posts[uid].created_at))
        return self._view('after', uid, pids, include_post)