
brew

Brew patentcity dataset

General functioning: stream text blobs | process | print JSON blobs to stdout, one JSON object per line (see the sketch after the version list below).

  • beta: entities only
  • v1: entities & relationships
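
Because the stream is newline-delimited JSON, any downstream step can consume it line by line. Below is a minimal sketch (not part of patentcity) of reading such a stream after redirecting it to a file; the path is hypothetical.

```python
# Minimal sketch, not patentcity code: consume the newline-delimited JSON
# that brew commands print to stdout, assuming it was redirected to a file.
import json

with open("data/US/uspatentxx.jsonl", "r") as lines:  # hypothetical output file
    for line in lines:
        blob = json.loads(line)
        print(blob["publication_number"], blob["hash_id"])
```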

grind(path, max_workers=10)

Stream texts in path and return json objects to stdout. Files are expected to be patent texts named after the publication_number of the patent (e.g. US-12345-A.txt).

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `path` | `str` | data path, wildcard allowed | *required* |
| `max_workers` | `int` | max number of workers | `10` |

Output:

{"publication_number": str, "text": str, "hash_id": str}

Usage:

```shell
patentcity brew v1.grind "data/US/*.txt"
# Nb: if the file is large, you can split and zip
```
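
For orientation, the sketch below shows how one input file could become one output blob. The actual helper used by `grind` (`_get_blob`, called in the source below) is not reproduced on this page, so the filename handling and the hashing scheme (MD5 of the text) are assumptions.

```python
# Hypothetical sketch of a _get_blob-style helper; the real implementation is
# not shown on this page. Hashing the text with MD5 is an assumption.
import hashlib
import json
import os


def get_blob(file: str) -> None:
    """Print one JSON blob for a patent text file named <publication_number>.txt."""
    publication_number = os.path.splitext(os.path.basename(file))[0]  # e.g. "US-12345-A"
    with open(file, "r") as fin:
        text = fin.read()
    hash_id = hashlib.md5(text.encode()).hexdigest()  # assumed hash scheme
    print(json.dumps({"publication_number": publication_number, "text": text, "hash_id": hash_id}))
```

Since `grind` maps this work over a thread pool, the order of output lines may not match the order of the input files.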

Source code in `patentcity/brew.py`:

````python
@app.command(name="v1.grind")
def grind(path: str, max_workers: int = 10):
    """Stream texts in `path` and return json objects to stdout.
    Files are expected to be patent texts named after the
    publication_number of the patent (e.g. US-12345-A.txt).


    Arguments:
        path: data path, wildcard allowed
        max_workers: max number of workers

    **Output**:
        ```json
        {"publication_number": str, "text": str, "hash_id": str}
        ```

    **Usage:**
        ```shell
        patentcity brew v1.grind "data/US/*.txt"
        # Nb: if the file is large, you can split and zip
        ```
    """
    files = iglob(path)
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        executor.map(_get_blob, files)
````

topping(file, config_file=None, max_workers=10)

Stream data in file and return enriched v1 json object to stdout.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `file` | `str` | file path | *required* |
| `config_file` | `str` | topping config file | `None` |
| `max_workers` | `int` | max number of workers | `10` |

Output:

{"publication_number": str, "patentee": List[dict], "hash_id": str,
"model_ents": str, "model_rels": str, "git_sha": str}

Usage:

```shell
mv data/US/entrel_uspatentxx.jsonl data/US/entrel_uspatentxx.jsonl.tmp
patentcity v1.topping --config-file configs/top_xxpatentxx.yaml "data/US/entrel_uspatentxx.jsonl.tmp"
# Nb: if the file is large, you can split and zip
```
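
The `cit_code` section of the topping config maps keys to JSON files on disk; at load time each path is replaced by the parsed content of that file (see the source below). A sketch of that substitution, with made-up keys and paths:

```python
# Illustration only: the keys and paths are made up; the substitution mirrors
# the config loading shown in the source below.
import json

import yaml

with open("configs/top_xxpatentxx.yaml", "r") as fin:
    config = yaml.safe_load(fin)  # e.g. {"cit_code": {"DE": "lib/cit_code_de.json"}}
for key, path in config["cit_code"].items():
    with open(path, "r") as fin:
        config["cit_code"][key] = json.load(fin)  # path -> parsed JSON lookup table
```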

Source code in `patentcity/brew.py`:

````python
@app.command(name="v1.topping")
def topping(file: str, config_file: str = None, max_workers=10):
    """Stream data in `file` and  return enriched v1 json object to stdout.

    Arguments:
        file: file path
        config_file: topping config file
        max_workers: max number of workers

    **Output**:
        ```json
        {"publication_number": str, "patentee": List[dict], "hash_id": str,
        "model_ents": str, "model_rels": str, "git_sha": str}
        ```

    **Usage:**
        ```shell
        mv data/US/entrel_uspatentxx.jsonl data/US/entrel_uspatentxx.jsonl.tmp
        patentcity v1.topping --config-file configs/top_xxpatentxx.yaml "data/US/entrel_uspatentxx.jsonl.tmp"  # pylint: disable=line-too-long
        # Nb: if the file is large, you can split and zip
        ```

    """
    with open(config_file, "r") as config_file_:
        config = yaml.load(config_file_, Loader=yaml.FullLoader)
    for k, v in config["cit_code"].items():  # pylint: disable=invalid-name
        config["cit_code"].update({k: json.loads(open(v, "r").read())})

    with open(file, "r") as lines:
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            executor.map(_topping, lines, repeat(config))
````

v1(path, model, rel_config, max_char=9999, batch_size=1000, inDelim='|')

Stream json objects in path and return json v1 objects to stdout.

Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `path` | `str` | data path, wildcard allowed | *required* |
| `model` | `str` | model path | *required* |
| `rel_config` | `str` | relationship resolution config file path | *required* |
| `max_char` | `int` | max char considered for entity extraction | `9999` |
| `batch_size` | `int` | size of the data batch passed to spaCy model | `1000` |
| `inDelim` | `str` | in delimiter | `'\|'` |

Output:

{"publication_number": str, "patentee": List[dict], "hash_id": str,
"model_ents": str, "model_rels": str, "git_sha": str}

Usage:

```shell
patentcity brew v1 "data/US/uspatentxx*.jsonl"
# Nb: if the file is large, you can split and zip
```
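
Each output line nests all patentees of a patent in the `patentee` list. The sketch below flattens that output to one record per patentee; the exact keys of each patentee dict depend on the entity and relationship models and are not listed on this page, so they are left generic.

```python
# Sketch: flatten v1 output (assumed saved to a file) to one record per patentee.
import json

with open("data/US/entrel_uspatentxx.jsonl", "r") as lines:  # hypothetical path to saved v1 output
    for line in lines:
        row = json.loads(line)
        for patentee in row["patentee"]:
            print(json.dumps({"publication_number": row["publication_number"], **patentee}))
```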

Source code in `patentcity/brew.py`:

````python
@app.command()
def v1(  # pylint: disable=invalid-name
    path: str,
    model: str,
    rel_config: str,
    max_char: int = 9999,
    batch_size: int = 1000,
    inDelim: str = "|",  # pylint: disable=invalid-name
):
    """
    Stream json objects in `path` and return json v1 objects to stdout.

    Arguments:
        path: data path, wildcard allowed
        model: model path
        rel_config: relationship resolution config file path
        max_char: max char considered for entity extraction
        batch_size: size of the data batch passed to spaCy model
        inDelim: in delimiter

    **Output**:
        ```json
        {"publication_number": str, "patentee": List[dict], "hash_id": str,
        "model_ents": str, "model_rels": str, "git_sha": str}
        ```

    **Usage:**
        ```shell
        patentcity brew v1 "data/US/uspatentxx*.jsonl"
        # Nb: if the file is large, you can split and zip
        ```
    """
    nlp = spacy.load(model)
    with open(rel_config, "r") as config_file:
        config = yaml.load(config_file, Loader=yaml.FullLoader)
    nlp.add_pipe("relation_extractor", config={"config": config}, last=True)

    files = glob(path)
    for file in files:
        publication_numbers = list(
            (json.loads(line)["publication_number"] for line in open(file, "r"))
        )
        hash_ids = list((json.loads(line)["hash_id"] for line in open(file, "r")))
        with open(file, "r") as lines:
            texts = (json.loads(line)["text"][:max_char] for line in lines)
            docs = nlp.pipe(texts, batch_size=batch_size)
            for i, doc in enumerate(docs):
                publication_number = publication_numbers[i]
                hash_id = hash_ids[i]
                patentees = [
                    {k: clean_text(v, inDelim) for k, v in patentee.items()}
                    for patentee in doc._.patentees
                ]
                row = {
                    "publication_number": publication_number,
                    "patentee": patentees,
                    "hash_id": hash_id,
                    "model_ents": model,
                    "model_rels": rel_config,
                    "git_sha": sha,
                }
                typer.echo(json.dumps(row))
````
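
Note that each input file is read three times above: once for publication numbers, once for hash ids, and once for the texts streamed into `nlp.pipe`. A single-pass variant is sketched below; it is not the project's code and trades the extra reads for holding one file's parsed records in memory.

```python
# Sketch only: parse one grind output file a single time instead of re-opening it.
import json
from typing import List


def read_records(file: str, max_char: int = 9999) -> List[dict]:
    """Return the parsed records of one grind output file, texts truncated to max_char."""
    with open(file, "r") as lines:
        records = [json.loads(line) for line in lines]
    for record in records:
        record["text"] = record["text"][:max_char]
    return records
```

The truncated texts can then be fed to `nlp.pipe` with a generator over the records and zipped back against the same list to recover `publication_number` and `hash_id`.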