Source code for pyconversations.reader.twitter

import json
from glob import glob

from tqdm import tqdm

from ..convo import Conversation
from ..message import Tweet
from .base import BaseReader


[docs]class QuoteReader(BaseReader):

    """
    A reader specifically designed to read JSONs of Quote tweet archives.
    """

[docs]    @staticmethod
    def read(path_pattern, ld=True):
        """
        Reads an entire directory of quote tweet JSONLine files,
        segments them into disjoint conversations,
        and returns the conversations.

        Parameters
        ----------
        path_pattern : str
            The path to the directory
        ld : bool
            Whether to activate language detection (Default: True)

        Returns
        -------
        list(Conversation)
            A list of disjoint conversations
        """
        convo = Conversation()
        for f in sorted(glob(f'{path_pattern}*.json')):
            print(f'Ingesting: {f}')
            with open(f) as fp:
                for line in tqdm(fp.readlines()):
                    for x in Tweet.parse_raw(json.loads(line), lang_detect=ld):
                        convo.add_post(x)
            print(f'In-memory posts: {len(convo.posts)}')

        return convo.segment()

[docs]    @staticmethod
    def iter_read(path_pattern, ld=True):
        """
        Function for creating a conversation reading iterator.
        Will read and parse part of a file/directory, yielding segments as queried.

        Parameters
        ----------
        path_pattern : str
            The path to file or directory containing Conversation data
        ld : bool
            Whether to activate language detection (Default: True)

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError


[docs]class ThreadsReader(BaseReader):

    """
    This is a custom Twitter "Threads" Reader.
    May be deprecated to adopt new Twitter reply functionality.
    """

[docs]    @staticmethod
    def read(path_pattern):
        """
        Function for reading an entire file/directory of conversations.

        Parameters
        ----------
        path_pattern : str
            The path to file or directory containing Conversation data

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError

[docs]    @staticmethod
    def iter_read(path_pattern, ld=True):
        """
        Function for creating a conversation reading iterator.
        Will read and parse part of a file/directory, yielding segments as queried.

        Parameters
        ----------
        path_pattern : str
            The path to file or directory containing Conversation data
        ld : bool
            Whether to activate language detection (Default: True)

        Yields
        ------
        2-tuple(str, list(Conversation))
            The string ID of the threaded discussion and a list of the disjoint Conversations identified within it
        """
        for f in sorted(glob(f'{path_pattern}*tweets.json')):
            convo = Conversation()
            src = f.split('_')[-1].replace('-tweets.json', '')
            tweets = json.load(open(f))
            for tid, tweet in tweets.items():
                xs = Tweet.parse_raw(tweet)
                for x in xs:
                    convo.add_post(x)

            yield src, convo.segment()