package com.cmes.spider.engine;

import com.cmes.spider.entity.FdaData;
import com.cmes.spider.service.FdaDataService;
import com.google.common.collect.Sets;
import lombok.extern.slf4j.Slf4j;
import org.assertj.core.util.Lists;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.stereotype.Component;

import java.io.IOException;
import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Deque;
import java.util.List;
import java.util.Objects;
import java.util.Set;

@Slf4j
@Component
public class FdaJob {

    private final static String baseUrl = "https://www.drugfuture.com";
    @Autowired
    private FdaDataService fdaDataService;

    public void run() throws Exception {
        log.info("spider start");
//        List<String> chars = Lists.newArrayList("b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z");
//        List<String> errorPaths = Lists.newArrayList();
//        chars.parallelStream().forEach(
//                i->{
//                    Set<String> paths = null;
//                    try {
//                        paths = getLinkSet(i);
//                    } catch (Exception e) {
//                        log.error("error ~ {},E{}",i,e);
//                        errorPaths.add(i);
//                    }
//                    paths.forEach(path -> {
//                        try {
//                            catchUrl(path);
//                        } catch (Exception e) {
//                            log.error("error {}:",path,e);
//                            errorPaths.add(path);
//                        }
//                    });
//                }
//        );

//        errorPaths.forEach(System.out::println);
        List<String> all = Lists.newArrayList(
                "/fda/drug/dextrose-5-and-sodium-chloride-0-45-in-plastic-container.html",
                "/fda/drug/dextrose-5-and-sodium-chloride-0-9.html",
                "/fda/drug/dextrose-10-in-plastic-container.html",
                "/fda/drug/dextrose-5-and-sodium-chloride-0-45.html",
                "/fda/drug/dv.html",
                "/fda/drug/dextrose-5-in-plastic-container.html",
                "/fda/drug/dextrose-2-5-in-plastic-container.html",
                "/fda/drug/dextrose-25.html",
                "/fda/drug/dextrose-5-and-sodium-chloride-0-9-in-plastic-container.html",
                "/fda/drug/sodium-chloride-0-9.html",
                "/fda/drug/fluconazole-in-sodium-chloride-0-9.html",
                "/fda/drug/gentamicin-sulfate-in-sodium-chloride-0-9-in-plastic-container.html",
                "/fda/drug/sodium-chloride-0-45-in-plastic-container.html",
                "/fda/drug/sodium-chloride-0-9-in-plastic-container.html",
                "/fda/drug/levofloxacin-in-dextrose-5-in-plastic-container.html");
        List<String> errorList = Lists.newArrayList();
        for (String s : all) {
            try {

                catchUrl(s);
            } catch (Exception e) {
                errorList.add(s);
                log.error("error !~ {}", s, e);
            }
        }
        System.out.println(errorList);
        log.info("spider over");
    }


    public Set<String> getLinkSet(String word) throws Exception {
        JsoupEngine jsoupEngine = new JsoupEngine(baseUrl + "/fda/drugbrowse-" + word + ".html");
        log.info("current url :" + jsoupEngine.getUrl());
        Elements links = jsoupEngine.select("ul").select("a[href]");
        Set<String> allLinks = Sets.newHashSet();
        for (Element link : links) {
            allLinks.add(link.attr("href"));
        }
        log.info("current page has {} data", allLinks.size());
        return allLinks;
    }

    public void catchUrl(String path) throws Exception {

        Set<String> extraPaths = Sets.newHashSet();
        log.info("ready catch, current path is : {}, ", path);
        JsoupEngine jsoupEngine = new JsoupEngine(baseUrl + path);
        log.info(jsoupEngine.getTitle());
        String pageNum = jsoupEngine.select("div[class=nav]").select("div").get(2).select("span[class=number]").get(0).text();

        if (Integer.valueOf(pageNum) > 1) {
            log.warn("current path has extra data: {}, page num:{}", path, pageNum);
            Elements pages = jsoupEngine.select("div[class=nav]").select("div").select("a[href]");
            for (Element page : pages) {
                if (Objects.equals(page.text(), "下一页")) {
                    extraPaths.add(page.attr("href"));
                }
            }
        }

        Elements tables = jsoupEngine.select("body").select("table");
        Element realTable = null;
        for (Element table : tables) {
            if (table.childrenSize() == 2) {
                realTable = table;
            }
        }
        Elements trs = realTable.select("tbody").select("tr");
        List<FdaData> fdaDataList = Lists.newArrayList();
        for (Element tr : trs) {

            Elements tds = tr.select("td");
            String link = tr.select("a[href]").attr("href");
            FdaData data = FdaData.builder()
                    .name(tds.get(0).text())
                    .applyNum(tds.get(1).text())
                    .productNum(tds.get(2).text())
                    .applyType(tds.get(3).text())
                    .activeIngredient(tds.get(4).text())
                    .formRoute(tds.get(5).text())
                    .specs(tds.get(6).text())
                    .rld(tds.get(7).text())
                    .rs(tds.get(8).text())
                    .applyDate(tds.get(9).text())
                    .productDate(tds.get(10).text())
                    .applyPerson(tds.get(11).text())
                    .marketState(tds.get(12).text())
                    .link(link)
                    .build();
            fdaDataList.add(data);
        }
        fdaDataService.batchInsertIgnore(fdaDataList);
        for (String extraPath : extraPaths) {
            catchUrl(extraPath);
        }
        log.info("catch over :{}", path);

    }
}
