Source code for pyconversations.reader.chan

import json
from glob import glob

from tqdm import tqdm

from ..convo import Conversation
from ..message import ChanPost
from .base import BaseReader


class ChanReader(BaseReader):
    """
    Reader class for reading and converting raw 4chan data
    """

    @staticmethod
    def read(path_pattern, ld=True):
        """
        Function for reading an entire file/directory of conversations.

        Not implemented for this reader; use `iter_read` instead.

        Parameters
        ----------
        path_pattern : str
            The path to file or directory containing Conversation data
        ld : bool
            Whether or not language detection should be activated. (Default: True)

        Raises
        ------
        NotImplementedError
        """
        raise NotImplementedError

    @staticmethod
    def iter_read(path_pattern, ld=True):
        """
        Function for iteratively reading an entire file/directory of conversations.

        Currently expects a `path_pattern` that points to a directory of JSON files
        enumerated from 00 to 99. The two-digit chunk number and the `.json` suffix
        are appended directly to `path_pattern`, so it must already end with the
        path separator (or filename prefix) that precedes the chunk number.

        Parameters
        ----------
        path_pattern : str
            The path to file or directory containing Conversation data
        ld : bool
            Whether or not language detection should be activated. (Default: True)

        Yields
        ------
        2-tuple(int, Conversation)
            A tuple containing which chunk (in 0..99) this Conversation originated from
            as well as a Conversation segment.
        """
        for chunk in range(100):
            print(f'Parsing chunk {chunk+1}/100...')

            # A fresh Conversation per chunk: posts from different chunks are
            # never mixed together.
            convo = Conversation()

            for filename in glob(path_pattern + f'{chunk:02d}.json'):
                # Close the handle as soon as the JSON is parsed, rather than
                # leaving up to 100 open files for the garbage collector.
                with open(filename) as fp:
                    raw_posts = json.load(fp)

                # Each file is a JSON object; only its values (the raw posts)
                # are used.
                for post in tqdm(raw_posts.values()):
                    px = ChanPost.parse_raw(post, lang_detect=ld)
                    # parse_raw may return a falsy value; such posts are skipped.
                    if px:
                        convo.add_post(px)

            yield chunk, convo.segment()