Source code for pyconversations.message.reddit

import re
from datetime import datetime

from .base import UniMessage


class RedditPost(UniMessage):
    """
    Reddit post object with additional Reddit-specific features
    """

    MENTION_REGEX = r'(^|[^\w])/?u/([A-Za-z0-9_-]+)\b'
    CLASS_STR = 'RedditPost'

    # Pattern for the "kind" prefix (e.g. ``t1_``, ``t3_``) found on parent IDs.
    _KIND_PREFIX_REGEX = r't\d_'

    # Raw fields that ``parse_raw`` silently drops. Defined once at class level
    # so the set is not rebuilt on every (recursive) call.
    # Fields that ARE consumed by ``parse_raw`` and therefore must not appear
    # here: author, author_name, body, children, created, created_utc, id,
    # name, parent_id, replies, selftext, title.
    _IGNORE_KEYS = frozenset({
        'approved_at_utc',
        'approved_by',
        'archived',
        'author_cakeday',
        'author_flair_background_color',
        'author_flair_css_class',
        'author_flair_richtext',
        'author_flair_template_id',
        'author_flair_text',
        'author_flair_text_color',
        'author_flair_type',
        'banned_at_utc',
        'banned_by',
        'body_html',
        'can_gild',
        'can_mod_post',
        'category',
        'clicked',
        'collapsed',
        'collapsed_reason',
        'content_categories',
        'contest_mode',
        'controversiality',
        'count',
        'delta',
        'depth',
        'distinguished',
        'domain',
        'downs',
        'edited',
        'gilded',
        'hidden',
        'hide_score',
        'is_crosspostable',
        'is_meta',
        'is_original_content',
        'is_reddit_media_domain',
        'is_self',
        'is_submitter',
        'is_video',
        'kind',
        'likes',
        'link_flair_background_color',
        'link_flair_css_class',
        'link_flair_richtext',
        'link_flair_template_id',
        'link_flair_text',
        'link_flair_text_color',
        'link_flair_type',
        'link_id',
        'locked',
        'media',
        'media_embed',
        'media_only',
        'mod_note',
        'mod_reason_by',
        'mod_reason_title',
        'mod_reports',
        'no_follow',
        'num_comments',
        'num_crossposts',
        'num_reports',
        'over_18',
        'parent_whitelist_status',
        'permalink',
        'pinned',
        'post_categories',
        'post_hint',
        'preview',
        'previous_visits',
        'pwls',
        'quarantine',
        'removal_reason',
        'report_reasons',
        'rte_mode',
        'saved',
        'score',
        'score_hidden',
        'secure_media',
        'secure_media_embed',
        'selftext_html',
        'send_replies',
        'spoiler',
        'stickied',
        'subreddit',
        'subreddit_id',
        'subreddit_name_prefixed',
        'subreddit_subscribers',
        'subreddit_type',
        'suggested_sort',
        'thumbnail',
        'thumbnail_height',
        'thumbnail_width',
        'ups',
        'upvote_ratio',
        'url',
        'user_reports',
        'view_count',
        'violated_rule',
        'visited',
        'whitelist_status',
        'wls',
    })

    def __init__(self, **kwargs):
        """
        Build a post, forcing the ``platform`` keyword to ``'Reddit'``.

        Parameters
        ----------
        **kwargs
            Keyword arguments forwarded to the parent constructor. Any
            caller-supplied ``platform`` value is overwritten.
        """
        kwargs['platform'] = 'Reddit'

        super().__init__(**kwargs)

    @staticmethod
    def parse_datestr(x):
        """
        Static method that specifies how to convert the native datetime string
        into a Python datetime object.

        The value is interpreted as a POSIX timestamp. The result is a naive
        datetime expressed in the local timezone of the running machine
        (the behaviour of ``datetime.fromtimestamp`` without a ``tz``).

        Parameters
        ----------
        x : str or float
            The raw timestamp (seconds since the epoch); anything accepted
            by ``float()``.

        Returns
        -------
        datetime.datetime
            The parsed datetime
        """
        return datetime.fromtimestamp(float(x))

    @staticmethod
    def _strip_kind_prefix(fullname):
        """
        Remove ``t<digit>_`` kind prefixes from a Reddit ID string.

        Parameters
        ----------
        fullname : str
            An ID that may carry a kind prefix (e.g. ``'t3_abc'``).

        Returns
        -------
        str
            The ID with every ``t<digit>_`` occurrence removed.
        """
        return re.sub(RedditPost._KIND_PREFIX_REGEX, '', fullname)

    def get_mentions(self):
        """
        Uses Reddit specific regex to attempt to identify
        user mentions within the comment text.

        The result is the union of the mentions found by the parent class
        and the usernames matched by ``MENTION_REGEX`` in ``self.text``.

        Returns
        -------
        set(str)
            The set of extracted usernames
        """
        # findall yields (leading-char, username) tuples; keep the username.
        reddit_names = {match[1] for match in re.findall(self.MENTION_REGEX, self.text)}

        return super().get_mentions() | reddit_names

    @staticmethod
    def parse_raw(data, lang_detect=False):
        """
        Static method that must be implemented by all non-abstract child classes.
        Concrete implementations should specify how to parse the raw data into this object.

        Accepts a single raw record (dict), a ``{'kind': ..., 'data': ...}``
        wrapper (including ``'Listing'`` wrappers), or a list of any of these.
        Nested ``replies`` / ``children`` are parsed recursively and returned
        in the same flat list, before the record that contains them.

        Parameters
        ----------
        data : JSON/dict
            The raw data to be pre-processed.
        lang_detect : bool
            A boolean which specifies whether language detection should be activated. (Default: False)

        Returns
        -------
        list(RedditPost)
            The parsed posts, as a flat list

        Raises
        ------
        KeyError
            If a record contains a key that is neither handled nor ignored,
            or if a record provides neither ``id`` nor ``name``.
        """
        # Lists are handled first so that a list is never indexed like a dict.
        # Non-dict entries (e.g. bare ID strings) are skipped, and the result
        # is flattened so every branch returns a flat list of posts.
        if isinstance(data, list):
            return [
                post
                for item in data
                if isinstance(item, dict)
                for post in RedditPost.parse_raw(item, lang_detect=lang_detect)
            ]

        # Unwrap {'kind': ..., 'data': ...} envelopes.
        if 'kind' in data and 'data' in data:
            if data['kind'] == 'Listing':
                return [
                    post
                    for child in data['data']['children']
                    for post in RedditPost.parse_raw(child['data'], lang_detect=lang_detect)
                ]
            return RedditPost.parse_raw(data['data'], lang_detect=lang_detect)

        post_cons = {
            'reply_to':    set(),
            'lang_detect': lang_detect
        }

        out = []
        for key, value in data.items():
            if key in RedditPost._IGNORE_KEYS:
                continue

            if key in ('author_name', 'author'):
                post_cons['author'] = value
            elif key in ('body', 'selftext'):
                # Body text goes after any text already collected (the title).
                post_cons['text'] = (post_cons['text'] + '\n' + value) if 'text' in post_cons else value
            elif key == 'title':
                # The title always goes first, whatever the key order was.
                post_cons['text'] = (value + '\n' + post_cons['text']) if 'text' in post_cons else value
            elif key == 'created':
                post_cons['created_at'] = RedditPost.parse_datestr(value)
            elif key == 'created_utc':
                # NOTE(review): 'created' wins over 'created_utc' when both
                # are present -- confirm this precedence is intended.
                if 'created' in data:
                    continue
                post_cons['created_at'] = RedditPost.parse_datestr(value)
            elif key == 'id':
                post_cons['uid'] = value
            elif key == 'name':
                # 'name' is only a fallback identifier when 'id' is absent.
                if 'id' not in data:
                    post_cons['uid'] = value
            elif key == 'parent_id':
                post_cons['reply_to'].add(RedditPost._strip_kind_prefix(value))
            elif key in ('replies', 'children'):
                if value:
                    out.extend(RedditPost.parse_raw(value, lang_detect=lang_detect))
            else:
                raise KeyError(
                    f'RedditPost::parse_raw - Unrecognized key: {key} --> {value}'
                    f' (record keys: {list(data)})'
                )

        if 'uid' not in post_cons:
            raise KeyError("RedditPost::parse_raw - record has neither 'id' nor 'name'; cannot assign a uid")

        # Records whose identifier is the placeholder '_' are not emitted.
        if post_cons['uid'] != '_':
            out.append(RedditPost(**post_cons))
        return out

    @staticmethod
    def parse_rd(data, lang_detect=True):
        """
        Secondary method for parsing raw Reddit data

        Expects a flat record with ``id``, ``author``, ``created_utc``,
        ``subreddit`` and ``type`` keys. Records of type ``'comment'`` also
        need ``body`` and ``parent_id``; records of type ``'submission'``
        need ``title`` and ``selftext``.

        Parameters
        ----------
        data : JSON/dict
            The raw data to be pre-processed.
        lang_detect : bool
            A boolean which specifies whether language detection should be activated. (Default: True)

        Returns
        -------
        RedditPost
            The parsed post

        Raises
        ------
        ValueError
            If ``data['type']`` is neither ``'comment'`` nor ``'submission'``.
        """
        cons = {
            'lang_detect': lang_detect,
            'uid':         data['id'],
            'author':      data['author'],
            'created_at':  RedditPost.parse_datestr(data['created_utc']),
            'tags':        {f'board={data["subreddit"]}'}
        }

        record_type = data['type']
        if record_type == 'comment':
            cons['text'] = data['body']
            # Same prefix stripping as parse_raw, so reply IDs line up.
            cons['reply_to'] = {RedditPost._strip_kind_prefix(data['parent_id'])}
        elif record_type == 'submission':
            cons['text'] = data['title'] + ' ' + data['selftext']
            cons['reply_to'] = set()
        else:
            raise ValueError(f'RedditPost::parse_rd -- Unrecognized type: {data}')

        return RedditPost(**cons)