# Source code for pyconversations.reader.chan

import json
from glob import glob

from tqdm import tqdm

from ..convo import Conversation
from ..message import ChanPost
from .base import BaseReader


class ChanReader(BaseReader):

    """
    Reader class for reading and converting raw 4chan data
    """

    @staticmethod
    def read(path_pattern, ld=True):
        """
        Function for reading an entire file/directory of conversations.

        Not implemented for this reader: calling it always raises. Use
        `iter_read` to consume the data chunk by chunk instead.

        Parameters
        ----------
        path_pattern : str
            The path to file or directory containing Conversation data
        ld : bool
            Whether or not language detection should be activated. (Default: True)

        Raises
        ------
        NotImplementedError
            Always.
        """
        raise NotImplementedError

    @staticmethod
    def iter_read(path_pattern, ld=True):
        """
        Function for iteratively reading an entire file/directory of conversations.
        Currently expects a `path_pattern` that points to a directory of JSON files
        enumerated from 00 to 99.

        For each chunk number, the glob pattern ``path_pattern + 'NN.json'``
        (``NN`` being the zero-padded chunk number) is expanded, every matching
        file is parsed, and the values of its top-level JSON object are handed to
        ``ChanPost.parse_raw``. Posts for which the parser returns a falsy value
        are skipped. Progress is reported on stdout via ``print`` and ``tqdm``.

        Parameters
        ----------
        path_pattern : str
            Prefix that the two-digit chunk number and ``.json`` are appended to
            before globbing; include a trailing path separator when pointing at a
            directory.
        ld : bool
            Whether or not language detection should be activated. (Default: True)
            Forwarded to ``ChanPost.parse_raw`` as ``lang_detect``.

        Yields
        ------
        2-tuple(int, Conversation)
            A tuple containing which chunk (in 0..99) this Conversation originated from
            as well as the result of ``Conversation.segment()`` over the posts collected
            for that chunk. A tuple is yielded for every chunk number, including those
            for which no file matched (the conversation is then empty).
        """
        for chunk in range(100):
            print(f'Parsing chunk {chunk+1}/100...')

            convo = Conversation()
            for f in glob(path_pattern + f'{chunk:02d}.json'):
                # Load inside a context manager so the handle is closed as soon
                # as the file is parsed, rather than whenever it happens to be
                # garbage collected.
                with open(f) as fp:
                    raw_posts = json.load(fp)

                for post in tqdm(raw_posts.values()):
                    px = ChanPost.parse_raw(post, lang_detect=ld)
                    # parse_raw may return a falsy value; only keep real posts
                    if px:
                        convo.add_post(px)

            yield chunk, convo.segment()