
brew

Brew patentcity dataset

General functioning: stream text blobs | process | print JSON blobs to stdout, one JSON object per line (see the sketch after the version list below).

  • beta: entities only
  • v1: entities & relationships
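
Because the stream is newline-delimited JSON, any downstream step can consume it line by line. Below is a minimal sketch (not part of patentcity) of reading such a stream after redirecting it to a file; the path is hypothetical.

```python
# Minimal sketch, not patentcity code: consume the newline-delimited JSON
# that brew commands print to stdout, assuming it was redirected to a file.
import json

with open("data/US/uspatentxx.jsonl", "r") as lines:  # hypothetical output file
    for line in lines:
        blob = json.loads(line)
        print(blob["publication_number"], blob["hash_id"])
```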

grind(path, max_workers=10)

Stream texts in path and return json objects to stdout. Files are expected to be patent texts named after the publication_number of the patent (e.g. US-12345-A.txt).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `path` | `str` | data path, wildcard allowed | *required* |
| `max_workers` | `int` | max number of workers | `10` |

Output:

{"publication_number": str, "text": str, "hash_id": str}

Usage:

```shell
patentcity brew v1.grind "data/US/*.txt"
# Nb: if the file is large, you can split and zip
```
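
For orientation, the sketch below shows how one input file could become one output blob. The actual helper used by `grind` (`_get_blob`, called in the source below) is not reproduced on this page, so the filename handling and the hashing scheme (MD5 of the text) are assumptions.

```python
# Hypothetical sketch of a _get_blob-style helper; the real implementation is
# not shown on this page. Hashing the text with MD5 is an assumption.
import hashlib
import json
import os


def get_blob(file: str) -> None:
    """Print one JSON blob for a patent text file named <publication_number>.txt."""
    publication_number = os.path.splitext(os.path.basename(file))[0]  # e.g. "US-12345-A"
    with open(file, "r") as fin:
        text = fin.read()
    hash_id = hashlib.md5(text.encode()).hexdigest()  # assumed hash scheme
    print(json.dumps({"publication_number": publication_number, "text": text, "hash_id": hash_id}))
```

Since `grind` maps this work over a thread pool, the order of output lines may not match the order of the input files.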

Source code in `patentcity/brew.py`:

````python
@app.command(name="v1.grind")
def grind(path: str, max_workers: int = 10):
    """Stream texts in `path` and return json objects to stdout.
    Files are expected to be patent texts named after the
    publication_number of the patent (e.g. US-12345-A.txt).


    Arguments:
        path: data path, wildcard allowed
        max_workers: max number of workers

    **Output**:
        ```json
        {"publication_number": str, "text": str, "hash_id": str}
        ```

    **Usage:**
        ```shell
        patentcity brew v1.grind "data/US/*.txt"
        # Nb: if the file is large, you can split and zip
        ```
    """
    files = iglob(path)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        executor.map(_get_blob, files)
````

topping(file, config_file=None, max_workers=10)

Stream data in file and return enriched v1 json object to stdout.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file` | `str` | file path | *required* |
| `config_file` | `str` | topping config file | `None` |
| `max_workers` | `int` | max number of workers | `10` |

Output:

{"publication_number": str, "patentee": List[dict], "hash_id": str,
"model_ents": str, "model_rels": str, "git_sha": str}

Usage:

```shell
mv data/US/entrel_uspatentxx.jsonl data/US/entrel_uspatentxx.jsonl.tmp
patentcity v1.topping --config-file configs/top_xxpatentxx.yaml "data/US/entrel_uspatentxx.jsonl.tmp"
# Nb: if the file is large, you can split and zip
```
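
The `cit_code` section of the topping config maps keys to JSON files on disk; at load time each path is replaced by the parsed content of that file (see the source below). A sketch of that substitution, with made-up keys and paths:

```python
# Illustration only: the keys and paths are made up; the substitution mirrors
# the config loading shown in the source below.
import json

import yaml

with open("configs/top_xxpatentxx.yaml", "r") as fin:
    config = yaml.safe_load(fin)  # e.g. {"cit_code": {"DE": "lib/cit_code_de.json"}}
for key, path in config["cit_code"].items():
    with open(path, "r") as fin:
        config["cit_code"][key] = json.load(fin)  # path -> parsed JSON lookup table
```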

Source code in `patentcity/brew.py`:

````python
@app.command(name="v1.topping")
def topping(file: str, config_file: str = None, max_workers=10):
    """Stream data in `file` and  return enriched v1 json object to stdout.

    Arguments:
        file: file path
        config_file: topping config file
        max_workers: max number of workers

    **Output**:
        ```json
        {"publication_number": str, "patentee": List[dict], "hash_id": str,
        "model_ents": str, "model_rels": str, "git_sha": str}
        ```

    **Usage:**
        ```shell
        mv data/US/entrel_uspatentxx.jsonl data/US/entrel_uspatentxx.jsonl.tmp
        patentcity v1.topping --config-file configs/top_xxpatentxx.yaml "data/US/entrel_uspatentxx.jsonl.tmp"  # pylint: disable=line-too-long
        # Nb: if the file is large, you can split and zip
        ```

    """
    with open(config_file, "r") as config_file_:
        config = yaml.load(config_file_, Loader=yaml.FullLoader)
    for k, v in config["cit_code"].items():  # pylint: disable=invalid-name
        config["cit_code"].update({k: json.loads(open(v, "r").read())})

    with open(file, "r") as lines:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(_topping, lines, repeat(config))
````

v1(path, model, rel_config, max_char=9999, batch_size=1000, inDelim='|')

Stream json objects in path and return json v1 objects to stdout.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `path` | `str` | data path, wildcard allowed | *required* |
| `model` | `str` | model path | *required* |
| `rel_config` | `str` | relationship resolution config file path | *required* |
| `max_char` | `int` | max char considered for entity extraction | `9999` |
| `batch_size` | `int` | size of the data batch passed to spaCy model | `1000` |
| `inDelim` | `str` | in delimiter | `'\|'` |

Output:

{"publication_number": str, "patentee": List[dict], "hash_id": str,
"model_ents": str, "model_rels": str, "git_sha": str}

Usage:

```shell
patentcity brew v1 "data/US/uspatentxx*.jsonl"
# Nb: if the file is large, you can split and zip
```
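
Each output line nests all patentees of a patent in the `patentee` list. The sketch below flattens that output to one record per patentee; the exact keys of each patentee dict depend on the entity and relationship models and are not listed on this page, so they are left generic.

```python
# Sketch: flatten v1 output (assumed saved to a file) to one record per patentee.
import json

with open("data/US/entrel_uspatentxx.jsonl", "r") as lines:  # hypothetical path to saved v1 output
    for line in lines:
        row = json.loads(line)
        for patentee in row["patentee"]:
            print(json.dumps({"publication_number": row["publication_number"], **patentee}))
```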

Source code in `patentcity/brew.py`:

````python
@app.command()
def v1(  # pylint: disable=invalid-name
    path: str,
    model: str,
    rel_config: str,
    max_char: int = 9999,
    batch_size: int = 1000,
    inDelim: str = "|",  # pylint: disable=invalid-name
):
    """
    Stream json objects in `path` and return json v1 objects to stdout.

    Arguments:
        path: data path, wildcard allowed
        model: model path
        rel_config: relationship resolution config file path
        max_char: max char considered for entity extraction
        batch_size: size of the data batch passed to spaCy model
        inDelim: in delimiter

    **Output**:
        ```json
        {"publication_number": str, "patentee": List[dict], "hash_id": str,
        "model_ents": str, "model_rels": str, "git_sha": str}
        ```

    **Usage:**
        ```shell
        patentcity brew v1 "data/US/uspatentxx*.jsonl"
        # Nb: if the file is large, you can split and zip
        ```
    """
    nlp = spacy.load(model)
    with open(rel_config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    nlp.add_pipe("relation_extractor", config={"config": config}, last=True)

    files = glob(path)
    for file in files:
        publication_numbers = list(
            (json.loads(line)["publication_number"] for line in open(file, "r"))
        )
        hash_ids = list((json.loads(line)["hash_id"] for line in open(file, "r")))
        with open(file, "r") as lines:
            texts = (json.loads(line)["text"][:max_char] for line in lines)
            docs = nlp.pipe(texts, batch_size=batch_size)
            for i, doc in enumerate(docs):
                publication_number = publication_numbers[i]
                hash_id = hash_ids[i]
                patentees = [
                    {k: clean_text(v, inDelim) for k, v in patentee.items()}
                    for patentee in doc._.patentees
                ]
                row = {
                    "publication_number": publication_number,
                    "patentee": patentees,
                    "hash_id": hash_id,
                    "model_ents": model,
                    "model_rels": rel_config,
                    "git_sha": sha,
                }
                typer.echo(json.dumps(row))
````
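
Note that each input file is read three times above: once for publication numbers, once for hash ids, and once for the texts streamed into `nlp.pipe`. A single-pass variant is sketched below; it is not the project's code and trades the extra reads for holding one file's parsed records in memory.

```python
# Sketch only: parse one grind output file a single time instead of re-opening it.
import json
from typing import List


def read_records(file: str, max_char: int = 9999) -> List[dict]:
    """Return the parsed records of one grind output file, texts truncated to max_char."""
    with open(file, "r") as lines:
        records = [json.loads(line) for line in lines]
    for record in records:
        record["text"] = record["text"][:max_char]
    return records
```

The truncated texts can then be fed to `nlp.pipe` with a generator over the records and zipped back against the same list to recover `publication_number` and `hash_id`.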