CUNY-CL · kylebgorman · Jan 30, 2021 · Jan 17, 2021 · Jan 22, 2021 · Jan 23, 2021
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -139,6 +139,7 @@ Unreleased
 -   Handled Cantonese for scraping. (\#277)
 -   Added exclusion for reconstructions. (\#302)
 -   Added Vietnamese contour tone grouping test in `tests/test_config.py` (\#308)
+-   Added restart functionality. (\#340)
 
 #### Changed
 

diff --git a/data/README.md b/data/README.md
@@ -41,11 +41,12 @@
 | [TSV](tsv/bul_phonetic.tsv) | bul | Bulgarian | Bulgarian | True | Phonetic | 6,377 |
 | [TSV](tsv/bur_phonemic.tsv) | bur | Burmese | Burmese | False | Phonemic | 4,636 |
 | [TSV](tsv/bur_phonemic_filtered.tsv) | bur | Burmese | Burmese | False | Phonemic_filtered | 4,631 |
+| [TSV](tsv/yue_phonemic.tsv) | yue | Yue Chinese | Cantonese | False | Phonemic | 87,961 |
 | [TSV](tsv/crx_phonemic.tsv) | crx | Carrier | Carrier | False | Phonemic | 175 |
 | [TSV](tsv/cat_phonemic.tsv) | cat | Catalan; Valencian | Catalan | True | Phonemic | 55,829 |
 | [TSV](tsv/ceb_phonemic.tsv) | ceb | Cebuano | Cebuano | True | Phonemic | 326 |
 | [TSV](tsv/nya_phonemic.tsv) | nya | Nyanja | Chichewa | True | Phonemic | 823 |
-| [TSV](tsv/cmn_hani_phonemic.tsv) | cmn | Mandarin Chinese | Chinese (Han) | False | Phonemic | 125,901 |
+| [TSV](tsv/cmn_hani_phonemic.tsv) | cmn | Mandarin Chinese | Chinese (Han) | False | Phonemic | 133,686 |
 | [TSV](tsv/cho_phonemic.tsv) | cho | Choctaw | Choctaw | True | Phonemic | 112 |
 | [TSV](tsv/nci_phonemic.tsv) | nci | Classical Nahuatl | Classical Nahuatl | True | Phonemic | 820 |
 | [TSV](tsv/nci_phonetic.tsv) | nci | Classical Nahuatl | Classical Nahuatl | True | Phonetic | 1,396 |
@@ -253,7 +254,7 @@
 | [TSV](tsv/rum_phonemic.tsv) | rum | Romanian; Moldavian; Moldovan | Romanian | True | Phonemic | 4,108 |
 | [TSV](tsv/rum_phonetic.tsv) | rum | Romanian; Moldavian; Moldovan | Romanian | True | Phonetic | 6,394 |
 | [TSV](tsv/rum_phonetic_filtered.tsv) | rum | Romanian; Moldavian; Moldovan | Romanian | True | Phonetic_filtered | 6,286 |
-| [TSV](tsv/rus_phonetic.tsv) | rus | Russian | Russian | True | Phonetic | 402,483 |
+| [TSV](tsv/rus_phonetic.tsv) | rus | Russian | Russian | True | Phonetic | 402,600 |
 | [TSV](tsv/san_phonemic.tsv) | san | Sanskrit | Sanskrit | False | Phonemic | 6,841 |
 | [TSV](tsv/san_phonetic.tsv) | san | Sanskrit | Sanskrit | False | Phonetic | 673 |
 | [TSV](tsv/srd_phonemic.tsv) | srd | Sardinian | Sardinian | True | Phonemic | 216 |

diff --git a/data/languages_summary.tsv b/data/languages_summary.tsv
@@ -39,11 +39,12 @@ bul_phonemic_filtered.tsv	bul	Bulgarian	Bulgarian	True	Phonemic_filtered	31782
 bul_phonetic.tsv	bul	Bulgarian	Bulgarian	True	Phonetic	6377
 bur_phonemic.tsv	bur	Burmese	Burmese	False	Phonemic	4636
 bur_phonemic_filtered.tsv	bur	Burmese	Burmese	False	Phonemic_filtered	4631
+yue_phonemic.tsv	yue	Yue Chinese	Cantonese	False	Phonemic	87961
 crx_phonemic.tsv	crx	Carrier	Carrier	False	Phonemic	175
 cat_phonemic.tsv	cat	Catalan; Valencian	Catalan	True	Phonemic	55829
 ceb_phonemic.tsv	ceb	Cebuano	Cebuano	True	Phonemic	326
 nya_phonemic.tsv	nya	Nyanja	Chichewa	True	Phonemic	823
-cmn_hani_phonemic.tsv	cmn	Mandarin Chinese	Chinese (Han)	False	Phonemic	125901
+cmn_hani_phonemic.tsv	cmn	Mandarin Chinese	Chinese (Han)	False	Phonemic	133686
 cho_phonemic.tsv	cho	Choctaw	Choctaw	True	Phonemic	112
 nci_phonemic.tsv	nci	Classical Nahuatl	Classical Nahuatl	True	Phonemic	820
 nci_phonetic.tsv	nci	Classical Nahuatl	Classical Nahuatl	True	Phonetic	1396
@@ -251,7 +252,7 @@ pan_guru_phonemic.tsv	pan	Panjabi	Punjabi (Gurmukhi)	False	Phonemic	139
 rum_phonemic.tsv	rum	Romanian; Moldavian; Moldovan	Romanian	True	Phonemic	4108
 rum_phonetic.tsv	rum	Romanian; Moldavian; Moldovan	Romanian	True	Phonetic	6394
 rum_phonetic_filtered.tsv	rum	Romanian; Moldavian; Moldovan	Romanian	True	Phonetic_filtered	6286
-rus_phonetic.tsv	rus	Russian	Russian	True	Phonetic	402483
+rus_phonetic.tsv	rus	Russian	Russian	True	Phonetic	402600
 san_phonemic.tsv	san	Sanskrit	Sanskrit	False	Phonemic	6841
 san_phonetic.tsv	san	Sanskrit	Sanskrit	False	Phonetic	673
 srd_phonemic.tsv	srd	Sardinian	Sardinian	True	Phonemic	216

diff --git a/data/src/scrape.py b/data/src/scrape.py
@@ -6,12 +6,10 @@
 import json
 import logging
 import os
-import time
 import re
 
 from typing import Any, Dict, FrozenSet, Iterator
 
-import requests
 import wikipron  # type: ignore
 
 from data.src.codes import (
@@ -49,48 +47,21 @@ def _call_scrape(
     phones_set: FrozenSet[str] = None,
     tsv_filtered_path: str = "",
 ) -> None:
-    for unused_retries in range(10):
-        with open(tsv_path, "w", encoding="utf-8") as source:
-            try:
-                scrape_results = wikipron.scrape(config)
-                # Given phones, opens up a second tsv for scraping.
-                if phones_set:
-                    with open(
-                        tsv_filtered_path, "w", encoding="utf-8"
-                    ) as source_filtered:
-                        for (word, pron) in scrape_results:
-                            line = f"{word}\t{pron}"
-                            if _filter(word, pron, phones_set):
-                                print(line, file=source_filtered)
-                            print(line, file=source)
-                else:
-                    for (word, pron) in scrape_results:
-                        print(f"{word}\t{pron}", file=source)
-                return
-            except (
-                requests.exceptions.Timeout,
-                requests.exceptions.ConnectionError,
-            ):
-                logging.info(
-                    "Exception detected while scraping: %r, %r, %r",
-                    lang_settings["key"],
-                    tsv_path,
-                    tsv_filtered_path,
-                )
-                # Pauses execution for 10 min.
-                time.sleep(600)
-    # Log and remove TSVs for languages that failed.
-    logging.info(
-        "Failed to scrape %r with 10 retries (%s)",
-        lang_settings["key"],
-        lang_settings,
-    )
-    # Checks if second TSV was opened.
-    try:
-        os.remove(tsv_filtered_path)
-    except OSError:
-        pass
-    os.remove(tsv_path)
+    with open(tsv_path, "w", encoding="utf-8") as source:
+        scrape_results = wikipron.scrape(config)
+        # If phones are given, also writes filtered entries to a second TSV.
+        if phones_set:
+            with open(
+                tsv_filtered_path, "w", encoding="utf-8"
+            ) as source_filtered:
+                for (word, pron) in scrape_results:
+                    line = f"{word}\t{pron}"
+                    if _filter(word, pron, phones_set):
+                        print(line, file=source_filtered)
+                    print(line, file=source)
+        else:
+            for (word, pron) in scrape_results:
+                print(f"{word}\t{pron}", file=source)
 
 
 def _build_scraping_config(