package comirva.web.crawling.agmis;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.TreeSet;
import java.util.Vector;

/* loaded from: input_file:comirva/web/crawling/agmis/GoldenRetriever.class */
/**
 * Crawler worker thread.
 *
 * <p>Each instance repeatedly picks a random entry from the shared crawl list {@link #ri},
 * downloads it by launching an external {@code wget} process, and on success appends the
 * entry's index to {@link #PROCESSED_IDX_FILE} so that a later run can skip it.
 * {@link #main(String[])} loads the already-processed indices, fills the crawl list, starts
 * {@link #MAX_PARALLEL_DOWNLOADS} workers plus a {@code CrawlListManager}, and waits for the
 * workers to end.
 *
 * <p>All bookkeeping lives in static fields shared by every worker. Compound operations on the
 * shared collections are guarded by synchronizing on the collection itself; for the
 * {@link Vector}s this is the same monitor that {@code Vector}'s own methods use.
 */
public class GoldenRetriever extends Thread {
    /** Compile-time switch; not consulted by the code in this class as it stands. */
    public static final boolean CHECK_FOR_DIR_STRUCTURE = true;
    /** Number of lines of {@link #URL_FILE} skipped after each line that is sampled. */
    public static final int MAX_SKIP_URLS = 99;
    /** Compile-time switch; not consulted by the code in this class as it stands. */
    public static final boolean USE_PAGE_NO_RANGE_FILTER = false;
    /** Number of worker threads started and size of the wget process table. */
    public static final int MAX_PARALLEL_DOWNLOADS = 30;
    /** Minimum value (ms) {@code hosts.getTimeElapsed(host)} must report before a host is used again. */
    public static final long WAIT_BETWEEN_RETRIEVALS_FROM_SAME_HOST = 2000;
    /** Not referenced in this class. */
    public static final int EQUAL_HOST_LEVELS = 2;
    /** Not referenced in this class. */
    public static final int MAX_URLS_IN_CRAWL_LIST = 600000;

    private static long startTime;
    private static long lastMeasureTime;
    /** Number of successful retrievals; guarded by {@link #STATS_LOCK}. */
    private static long retrievedURLs;
    /** A throughput report is printed every this many successful retrievals. */
    private static final int MEASURE_INTERVAL = 100;
    /** Guards {@link #retrievedURLs} and {@link #lastMeasureTime} once the workers are running. */
    private static final Object STATS_LOCK = new Object();
    private static GoldenRetriever[] threads;
    /** Number of this worker; also its slot in {@link #wgetProcesses}. */
    private final int threadNo;
    /** Appends one index per line to {@link #PROCESSED_IDX_FILE}; opened by {@link #main(String[])}. */
    private static BufferedWriter bwProcessedIdx;

    public static final File ROOT_DIR = new File("/media/AGMIS2/exalead_4th/M/");
    public static final File URL_FILE = new File("/media/AGMIS2/exalead_4th/crawling.txt");
    public static final File PROCESSED_IDX_FILE = new File("/media/AGMIS2/exalead_4th/processed_idx.txt");
    public static final File WGET = new File("wget");
    /** Number of leading lines of {@link #URL_FILE} skipped by {@link #fillCrawlList()}. */
    public static int START_OFFSET = 0;
    /** {@link #fillCrawlList()} does nothing once {@link #START_OFFSET} exceeds this value. */
    public static int END_OFFSET = 99;
    /** Not referenced in this class. */
    public static final int[] PAGE_NO_RANGE = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    /** Shared crawl list: entries that still have to be downloaded. */
    public static Vector<RetrievalData> ri = new Vector<>();
    /** Indices recorded as retrieved (loaded from disk plus this run); accessed under its own monitor here. */
    public static TreeSet<Integer> retrievedIdx = new TreeSet<>();
    /** Per-host bookkeeping queried via {@code getTimeElapsed} and refreshed via {@code update}. */
    public static DownloadControlDataVector hosts = new DownloadControlDataVector();
    /** Indices currently claimed by a worker. */
    public static Vector<Integer> currentlyDownloadingIdx = new Vector<>();
    /** URLs whose host must not be crawled. */
    public static Vector<URL> blacklistSites = new Vector<>();
    /** Most recent wget process of each worker, indexed by thread number. */
    private static Process[] wgetProcesses = new Process[MAX_PARALLEL_DOWNLOADS];

    /**
     * Creates a worker.
     *
     * @param i number of the worker, used in log output and as slot in the process table
     */
    public GoldenRetriever(int i) {
        this.threadNo = i;
    }

    /**
     * Waits for a process to terminate while discarding whatever it writes to stdout and stderr,
     * so the child cannot block on a full pipe.
     *
     * <p>The exit status is polled every 50 ms; {@link Process#exitValue()} signals "still
     * running" with an {@link IllegalThreadStateException}, which is therefore handled inside
     * the polling loop rather than treated as an error.
     *
     * @param process the process to wait for
     * @return the exit value of the process, or -1 if waiting failed or was interrupted
     */
    public static int doWaitFor(Process process) {
        int exitValue = -1;
        try {
            InputStream stdout = process.getInputStream();
            InputStream stderr = process.getErrorStream();
            byte[] scratch = new byte[4096];
            boolean finished = false;
            while (!finished) {
                try {
                    drain(stdout, scratch);
                    drain(stderr, scratch);
                    exitValue = process.exitValue();
                    finished = true;
                } catch (IllegalThreadStateException stillRunning) {
                    // The process has not terminated yet: back off briefly, then poll again.
                    Thread.sleep(50L);
                }
            }
        } catch (InterruptedException e) {
            // Keep the interrupt visible to the caller instead of swallowing it.
            Thread.currentThread().interrupt();
            System.err.println("doWaitFor(): unexpected exception - " + e.getMessage());
        } catch (Exception e) {
            System.err.println("doWaitFor(): unexpected exception - " + e.getMessage());
        }
        return exitValue;
    }

    /** Reads and discards every byte currently available on {@code in} without blocking. */
    private static void drain(InputStream in, byte[] scratch) throws IOException {
        while (in.available() > 0) {
            if (in.read(scratch) < 0) {
                return;
            }
        }
    }

    /**
     * Worker loop: claims a random eligible entry, downloads it with wget and records the
     * result, until the crawl list is empty. Any exception ends this worker after printing
     * the stack trace.
     */
    @Override // java.lang.Thread, java.lang.Runnable
    public void run() {
        while (!ri.isEmpty()) {
            try {
                RetrievalData job = claimNextJob();
                if (job == null) {
                    // The crawl list ran empty while this worker was looking for work.
                    return;
                }
                hosts.update(new DownloadControlData(job.getUrl().getHost()));
                createDirectoryStructure(job.getFile().getParentFile());
                System.out.println("Thread " + this.threadNo + " is retrieving from offset " + (job.getIndex().intValue() % (MAX_SKIP_URLS + 1)) + " " + job.getUrl() + " (idx: " + job.getIndex() + ") into " + job.getFile().getAbsolutePath());
                // An explicit argument list keeps paths containing whitespace intact and avoids
                // passing literal quote characters as part of the user agent.
                Process wget = new ProcessBuilder(
                        WGET.getPath(),
                        "--no-clobber", "--waitretry=20", "--random-wait", "--no-cookies",
                        "-e", "robots=on",
                        "--user-agent=Mozilla/5.0",
                        "-t", "1", "-T", "5",
                        "-O", job.getFile().getAbsolutePath(),
                        job.getUrl().toString()).start();
                if (this.threadNo >= 0 && this.threadNo < wgetProcesses.length) {
                    wgetProcesses[this.threadNo] = wget;
                }
                doWaitFor(wget);
                wget.destroy();
                // NOTE(review): when the target file is missing the entry stays both in the crawl
                // list and in currentlyDownloadingIdx, so it is never picked again - confirm that
                // this is the intended handling of failed downloads.
                if (job.getFile().exists() && job.getIndex() != null) {
                    recordSuccess(job);
                }
            } catch (Exception e) {
                e.printStackTrace();
                return;
            }
        }
    }

    /**
     * Picks random crawl-list entries until one is found that has not been retrieved, is not
     * claimed by another worker and whose host passes the wait-time check, then claims it.
     *
     * @return the claimed entry, or {@code null} if the crawl list is empty
     */
    private static RetrievalData claimNextJob() {
        while (true) {
            RetrievalData candidate = pickRandomJob();
            if (candidate == null) {
                return null;
            }
            Integer idx = candidate.getIndex();
            if (isRetrieved(idx) || currentlyDownloadingIdx.contains(idx)) {
                continue;
            }
            if (hosts.getTimeElapsed(candidate.getUrl().getHost()) < WAIT_BETWEEN_RETRIEVALS_FROM_SAME_HOST) {
                continue;
            }
            // Re-check under the monitor so that two workers can never claim the same index.
            synchronized (currentlyDownloadingIdx) {
                if (currentlyDownloadingIdx.contains(idx)) {
                    continue;
                }
                currentlyDownloadingIdx.addElement(idx);
            }
            return candidate;
        }
    }

    /**
     * Returns a uniformly chosen entry of the crawl list, or {@code null} if it is empty. Size
     * lookup and element access happen under the list's monitor so that a concurrent removal
     * cannot cause an out-of-range access.
     */
    private static RetrievalData pickRandomJob() {
        synchronized (ri) {
            int size = ri.size();
            if (size == 0) {
                return null;
            }
            return ri.get((int) Math.floor(Math.random() * size));
        }
    }

    /** Tells whether {@code idx} is already recorded in {@link #retrievedIdx}. */
    private static boolean isRetrieved(Integer idx) {
        synchronized (retrievedIdx) {
            return retrievedIdx.contains(idx);
        }
    }

    /**
     * Records a finished download: remembers the index, appends it to the processed-index file,
     * removes the entry from the crawl list, releases the claim and updates the statistics.
     */
    private static void recordSuccess(RetrievalData job) throws IOException {
        Integer idx = job.getIndex();
        synchronized (retrievedIdx) {
            retrievedIdx.add(idx);
        }
        bwProcessedIdx.append(idx + "\n");
        bwProcessedIdx.flush();
        ri.removeElement(job);
        currentlyDownloadingIdx.removeElement(idx);
        countRetrievalAndReport();
    }

    /**
     * Counts one successful retrieval and prints throughput figures each time the counter
     * reaches a multiple of {@link #MEASURE_INTERVAL}. Elapsed times are computed as
     * {@code long} so that long-running crawls do not overflow.
     */
    private static void countRetrievalAndReport() {
        synchronized (STATS_LOCK) {
            retrievedURLs++;
            if (retrievedURLs % MEASURE_INTERVAL != 0) {
                return;
            }
            long now = System.currentTimeMillis();
            long totalSeconds = (now - startTime) / 1000;
            long intervalSeconds = (now - lastMeasureTime) / 1000;
            System.out.println(retrievedURLs + " URLs successfully retrieved in " + totalSeconds + " sec -> " + (((float) retrievedURLs) / totalSeconds) + " retrievals/sec");
            System.out.println("The last 100 URLs were successfully retrieved in " + intervalSeconds + " sec -> " + (((float) MEASURE_INTERVAL) / intervalSeconds) + " retrievals/sec");
            lastMeasureTime = now;
        }
        System.gc();
    }

    /**
     * Reads and discards up to {@code i} lines.
     *
     * @return the number of lines actually skipped; smaller than {@code i} if the end of the
     *         input was reached or reading failed
     */
    private static int skipLines(BufferedReader bufferedReader, int i) {
        int skipped = 0;
        try {
            while (skipped < i && bufferedReader.readLine() != null) {
                skipped++;
            }
        } catch (IOException e) {
            // Stop instead of retrying: a reader that fails once would otherwise be polled forever.
            e.printStackTrace();
        }
        return skipped;
    }

    /**
     * Reads and discards a random number of lines between 0 and {@code i}.
     *
     * @return the number of lines actually skipped
     */
    private static int skipLinesRandomly(BufferedReader bufferedReader, int i) {
        int skipped = 0;
        try {
            int target = (int) Math.round(Math.random() * i);
            while (skipped < target && bufferedReader.readLine() != null) {
                skipped++;
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return skipped;
    }

    /** Fills the crawl list, starts all worker threads and then the crawl-list manager. */
    private static void startRetrieval() {
        fillCrawlList();
        threads = new GoldenRetriever[MAX_PARALLEL_DOWNLOADS];
        for (int i = 0; i < threads.length; i++) {
            threads[i] = new GoldenRetriever(i);
            threads[i].start();
        }
        CrawlListManager crawlListManager = new CrawlListManager(ri, hosts);
        crawlListManager.setPriority(4);
        crawlListManager.start();
    }

    /**
     * Adds entries from {@link #URL_FILE} to the crawl list.
     *
     * <p>The first {@link #START_OFFSET} lines are skipped; after that one line is taken and the
     * following {@link #MAX_SKIP_URLS} lines are skipped, repeatedly, until the end of the file.
     * The zero-based line number serves as the entry's index. Lines whose index is already in
     * {@link #retrievedIdx}, lines that cannot be parsed and lines whose host is blacklisted
     * are left out. Nothing happens once {@link #START_OFFSET} exceeds {@link #END_OFFSET}.
     */
    /* JADX INFO: Access modifiers changed from: protected */
    public static void fillCrawlList() {
        if (START_OFFSET > END_OFFSET) {
            System.out.println("START_OFFSET reached END_OFFSET! No more items will be added to the crawl list.");
            return;
        }
        int added = 0;
        try (BufferedReader reader = new BufferedReader(new FileReader(URL_FILE))) {
            System.out.println("Filling craw list with a set of URLs with offset " + START_OFFSET);
            int lineIdx = skipLines(reader, START_OFFSET);
            String line;
            while ((line = reader.readLine()) != null) {
                if (!isRetrieved(Integer.valueOf(lineIdx))) {
                    RetrievalData entry = parseCrawlLine(line, lineIdx);
                    if (entry != null) {
                        ri.addElement(entry);
                        added++;
                    }
                }
                // Always advance, even for rejected lines, so indices stay aligned with the file.
                lineIdx += skipLines(reader, MAX_SKIP_URLS) + 1;
            }
            System.out.println(added + " URLs were added to the crawl list. The list now contains " + ri.size() + " items.");
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e2) {
            e2.printStackTrace();
        }
    }

    /**
     * Turns one line of {@link #URL_FILE} into a crawl-list entry.
     *
     * <p>A line consists of three comma-separated fields followed by the URL, which may itself
     * contain commas. The target file is
     * {@code ROOT_DIR/<field1>/<first character of field2>/<field2>/<field3>.html}, with field 3
     * left-padded with zeros to three characters.
     *
     * @param line    the raw line
     * @param lineIdx zero-based line number, used as the entry's index
     * @return the entry, or {@code null} if the line is malformed or its host is blacklisted
     */
    private static RetrievalData parseCrawlLine(String line, int lineIdx) {
        String[] fields = line.split(",", 4);
        if (fields.length < 4 || fields[1].isEmpty()) {
            System.err.println("Skipping malformed crawl list line " + lineIdx + ": " + line);
            return null;
        }
        String pageNo = fields[2];
        if (pageNo.length() == 1) {
            pageNo = "00" + pageNo;
        } else if (pageNo.length() == 2) {
            pageNo = "0" + pageNo;
        }
        String targetPath = ROOT_DIR.getAbsolutePath() + "/" + fields[0] + "/" + fields[1].substring(0, 1) + "/" + fields[1] + "/" + pageNo + ".html";
        URL url;
        try {
            url = new URL(fields[3]);
        } catch (java.net.MalformedURLException e) {
            System.err.println("Skipping crawl list line " + lineIdx + " with invalid URL: " + fields[3]);
            return null;
        }
        for (int i = 0; i < blacklistSites.size(); i++) {
            if (blacklistSites.elementAt(i).getHost().equals(url.getHost())) {
                System.out.println("Excluding " + fields[3] + " because host is blacklisted.");
                return null;
            }
        }
        return new RetrievalData(url, targetPath, lineIdx);
    }

    /**
     * Creates the given directory together with any missing parent directories. Does nothing
     * for {@code null} or for a path that already exists.
     *
     * @param file the directory to create, may be {@code null}
     */
    public static void createDirectoryStructure(File file) {
        if (file == null || file.exists()) {
            return;
        }
        file.mkdirs();
    }

    /**
     * Entry point: registers the blacklisted hosts, counts the URLs in {@link #URL_FILE}, loads
     * the indices of already retrieved URLs from {@link #PROCESSED_IDX_FILE}, opens that file
     * for appending, starts the retrieval and waits for all worker threads to end.
     *
     * @param strArr command-line arguments, unused
     */
    public static void main(String[] strArr) {
        startTime = System.currentTimeMillis();
        lastMeasureTime = startTime;
        try {
            blacklistSites.addElement(new URL("http://www.downtownmusicgallery.com"));
            blacklistSites.addElement(new URL("http://downtownmusicgallery.com"));
            PROCESSED_IDX_FILE.createNewFile();
            System.out.println("Scanning " + URL_FILE);
            int totalUrls = countLines(URL_FILE);
            System.out.println("Total number of URLs that must be retrieved: " + totalUrls);
            System.out.println("Reading information on already retrieved URLs from " + PROCESSED_IDX_FILE);
            int alreadyRetrieved = loadProcessedIndices();
            System.out.println("URLs already retrieved: " + alreadyRetrieved);
            bwProcessedIdx = new BufferedWriter(new FileWriter(PROCESSED_IDX_FILE, true));
            startRetrieval();
            for (int i = 0; i < threads.length; i++) {
                threads[i].join();
            }
        } catch (IOException e) {
            e.printStackTrace();
        } catch (InterruptedException e2) {
            Thread.currentThread().interrupt();
            e2.printStackTrace();
        }
    }

    /** Returns the number of lines in {@code file}. */
    private static int countLines(File file) throws IOException {
        int lines = 0;
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            while (reader.readLine() != null) {
                lines++;
            }
        }
        return lines;
    }

    /**
     * Loads {@link #PROCESSED_IDX_FILE} (one index per line) into {@link #retrievedIdx}. Blank
     * lines and lines that are not integers - for instance a line cut short by an aborted
     * run - are skipped with a warning instead of aborting the start-up.
     *
     * @return the number of distinct indices added
     */
    private static int loadProcessedIndices() throws IOException {
        int loaded = 0;
        try (BufferedReader reader = new BufferedReader(new FileReader(PROCESSED_IDX_FILE))) {
            String line;
            while ((line = reader.readLine()) != null) {
                String trimmed = line.trim();
                if (trimmed.isEmpty()) {
                    continue;
                }
                try {
                    Integer idx = Integer.valueOf(trimmed);
                    synchronized (retrievedIdx) {
                        if (retrievedIdx.add(idx)) {
                            loaded++;
                        }
                    }
                } catch (NumberFormatException e) {
                    System.err.println("Ignoring invalid entry in " + PROCESSED_IDX_FILE + ": " + line);
                }
            }
        }
        return loaded;
    }

    /**
     * Closes the processed-index writer, if it was ever opened, and waits for and destroys every
     * wget process recorded in the process table.
     */
    @Override
    protected void finalize() {
        if (bwProcessedIdx != null) {
            try {
                bwProcessedIdx.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        for (int i = 0; i < wgetProcesses.length; i++) {
            Process process = wgetProcesses[i];
            if (process != null) {
                doWaitFor(process);
                process.destroy();
            }
        }
    }
}
