Source code for pyconversations.message.twitter

import re
from datetime import datetime

from .base import UniMessage


class Tweet(UniMessage):

    """
    Twitter post object with additional Twitter-specific features
    """

    # Group 1 is whatever precedes the '@' (start of string, or a character
    # that is neither '@' nor a word character); group 2 is the handle itself,
    # limited to 1-15 word characters.
    MENTION_REGEX = r'(^|[^@\w])@(\w{1,15})\b'
    CLASS_STR = 'Tweet'

    # Layout of the raw `created_at` string. The UTC offset is matched as the
    # literal text '+0000' (no %z), so the parsed datetime is naive.
    _DATE_FORMAT = '%a %b %d %H:%M:%S +0000 %Y'

    # Top-level keys of a raw tweet that parse_raw deliberately skips.
    # 'entities' is listed here because it is processed in a separate pass,
    # after the text has been collected.
    _IGNORED_KEYS = frozenset({
        'id_str', 'truncated', 'display_text_range', 'entities', 'source',
        'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str',
        'in_reply_to_screen_name', 'geo', 'coordinates', 'place', 'contributors',
        'is_quote_status', 'retweet_count', 'favorite_count', 'favorited',
        'retweeted', 'metadata', 'extended_entities', 'possibly_sensitive',
        'quoted_status_id_str', 'quoted_status_permalink', 'withheld_in_countries',
        'in_reply_to_status_created_at', 'possibly_sensitive_appealable', 'scopes',
        'withheld_scope', 'withheld_copyright',
        'filter_level', 'trends',
    })

    # Keys of the `entities` sub-dict that do not affect the text.
    _IGNORED_ENTITY_KEYS = frozenset({
        'hashtags', 'symbols', 'user_mentions', 'trends',
    })

    def __init__(self, **kwargs):
        """
        Build a Tweet, forcing the `platform` keyword to 'Twitter'.

        Parameters
        ----------
        kwargs
            Keyword arguments forwarded to the parent constructor.
            Any caller-supplied `platform` value is overwritten.
        """
        kwargs['platform'] = 'Twitter'

        super().__init__(**kwargs)

    @staticmethod
    def parse_datestr(x):
        """
        Static method that specifies how to convert the native datetime string
        into a Python datetime object.

        Parameters
        ----------
        x
            The raw datetime string, e.g. 'Wed Oct 10 20:19:24 +0000 2018'.
            The offset must be the literal text '+0000'.

        Returns
        -------
        datetime
            A naive datetime (no tzinfo attached).

        Raises
        ------
        ValueError
            If `x` does not match the expected layout.
        """
        return datetime.strptime(x, Tweet._DATE_FORMAT)

    def get_mentions(self):
        """
        Uses Twitter specific regex to attempt to identify
        user mentions within the comment text.

        Returns a set of usernames: the union of whatever the parent class
        reports and the handles matched by `MENTION_REGEX` in `self.text`.
        """
        # Group 2 of the twitter mention regex is the bare handle (no '@').
        handles = {
            match.group(2)
            for match in re.finditer(self.MENTION_REGEX, self.text)
        }

        return super().get_mentions() | handles

    @staticmethod
    def _expand_entity_urls(text, entities):
        """
        Replace the shortened URLs in `text` using a tweet's `entities` dict.

        Parameters
        ----------
        text
            The tweet text collected so far.
        entities
            The raw `entities` dict. For `media` items the `url` is replaced
            by `display_url`; for `urls` items it is replaced by `expanded_url`.

        Returns
        -------
        str
            The text with every listed URL substituted.

        Raises
        ------
        KeyError
            If `entities` contains a key that is neither handled nor ignored.
        """
        for key, value in entities.items():
            if key in Tweet._IGNORED_ENTITY_KEYS:
                continue

            if key == 'media':
                replacement_field = 'display_url'
            elif key == 'urls':
                replacement_field = 'expanded_url'
            else:
                raise KeyError(f'Tweet:parse_raw (entities) - Unrecognized key: {key} --> {value}')

            for item in value:
                # Literal replacement on purpose: a URL is not a regex ('.',
                # '?', '+' are metacharacters) and the substituted URL must
                # not be read as a replacement template (backslashes, \1).
                text = text.replace(item['url'], item[replacement_field])

        return text

    @staticmethod
    def parse_raw(data, lang_detect=False):
        """
        Static method that must be implemented by all non-abstract child classes.
        Concrete implementations should specify how to parse the raw data into this object.
        Returns a list of Twitter posts.

        Any tweet nested under `quoted_status` is parsed recursively and placed
        in the list before the tweet that quotes it. A tweet whose text is
        missing or empty is left out of the list.

        Parameters
        ----------
        data
            The raw data to be pre-processed (a dict describing one tweet).
        lang_detect
            A boolean which specifies whether language detection should be activated. (Default: False)

        Raises
        ------
        KeyError
            If `data` (or its `entities`) contains a key that is neither
            handled nor explicitly ignored.
        """
        cons_vals = {
            'reply_to': set(),
            'lang_detect': lang_detect,
        }
        out = []

        for key, value in data.items():
            if key in Tweet._IGNORED_KEYS:
                continue

            if key == 'created_at':
                cons_vals['created_at'] = Tweet.parse_datestr(value)
            elif key == 'id':
                cons_vals['uid'] = value
            elif key == 'full_text':
                # The untruncated text always wins over 'text'.
                cons_vals['text'] = value
            elif key == 'text':
                # Only a fallback: keep 'full_text' if it was already seen,
                # whatever order the two keys arrive in.
                cons_vals.setdefault('text', value)
            elif key == 'lang':
                cons_vals['lang'] = value
            elif key in ('in_reply_to_status_id', 'quoted_status_id'):
                # NOTE(review): a null id is added as None, exactly as before;
                # confirm the parent class tolerates None in `reply_to`.
                cons_vals['reply_to'].add(value)
            elif key == 'user':
                cons_vals['author'] = value['screen_name']
            elif key == 'quoted_status':
                # Quoted tweets are parsed with the same language-detection
                # setting as the tweet that quotes them.
                out.extend(Tweet.parse_raw(value, lang_detect=lang_detect))
            else:
                raise KeyError(f'Tweet:parse_raw - Unrecognized key: {key} --> {value}')

        # Do entities last, once the final text is known.
        if 'entities' in data:
            cons_vals['text'] = Tweet._expand_entity_urls(
                cons_vals.get('text', ''), data['entities']
            )

        if cons_vals.get('text'):
            out.append(Tweet(**cons_vals))

        return out