Add comments and pydoc

3 years ago · 5f593316eb
--- a/Src/medi_fetch/main.py
+++ b/Src/medi_fetch/main.py
@ -1,139 +1,141 @@
 # Medicine Information Spider
 # Spider: requests
 # parser: XPath
 # database: ?
 # Modules required: requests,lxml,(database connection)
 # -*- coding: utf-8 -*-
 # author:Shen tongle in DASE_ECNU 2021

 import requests
 from lxml import etree
 import time
 import sqlite3


 class Medicine:
    """Scraper for the medicine listing and detail pages of www.315jiage.cn.

    ``get_data`` crawls one listing page, follows every detail link on it,
    and prints one dict per medicine.  The SQLite insert is present only as
    commented-out code, so nothing is written to the database by default.
    """

    # Labels that introduce a value inside the detail page's property block.
    _TITLES = ('规格：', '剂型：', '包装单位：', '批准文号：', '生产厂家：', '条形码：', '主治疾病：')
    # Seconds to wait for a single HTTP response before giving up.
    _TIMEOUT = 30

    def __init__(self):
        # Prefix prepended to the hrefs found on listing pages.
        self.head = "https://www.315jiage.cn/"
        self.user = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
        self.headers = {
            "User-agent": self.user
        }
        # Set to 1 by get_data() when the listing page's title starts with "您".
        self.flag = 0
        # Number of detail pages that could not be downloaded or parsed.
        self.failed_num = 0
        self.conn = sqlite3.connect('medicine.db')
        self.cur = self.conn.cursor()

    def get_data(self, type, page):
        """Crawl one listing page and print the record of every medicine on it.

        ``self.flag`` is reset to 0 on entry and set to 1 (followed by an
        early return) when the listing page's title starts with "您"; the
        caller uses that as its stop signal.  Detail pages that fail to
        download or parse are counted in ``self.failed_num`` and skipped.

        Args:
            type: Category number interpolated into the listing URL.  The
                name shadows the builtin but is kept for existing callers.
            page: Page number interpolated into the listing URL.

        Raises:
            requests.exceptions.RequestException: If the listing page itself
                cannot be fetched (only detail-page errors are swallowed).
        """
        self.flag = 0
        url = f"https://www.315jiage.cn/mc{type}p{page}.aspx"
        response = requests.get(url, headers=self.headers, timeout=self._TIMEOUT)
        self.content = response.content.decode("utf-8")
        html = etree.HTML(self.content)

        # NOTE(review): a title starting with "您" presumably marks the site's
        # notice page served past the last listing page -- confirm.
        # Guarded lookup: a page without a <title> no longer raises IndexError.
        page_title = html.xpath("//head/title/text()")
        if page_title and page_title[0].startswith("您"):
            self.flag = 1
            return

        results = html.xpath("""//div[@class="col-2"]/a[@target="_blank"]/@href""")
        print(f"Browsing page {page}")
        count = 0
        for each_id in results:
            try:
                info = self._fetch_info(each_id)
            except (IndexError, UnicodeDecodeError, TimeoutError,
                    requests.exceptions.RequestException):
                # RequestException covers HTTPError, ConnectionError and the
                # Timeout errors requests actually raises; IndexError means a
                # mandatory element was missing from the page.
                print("Failed to download data!")
                self.failed_num += 1
                continue

            count += 1
            print(f"Saving infomation {page}-{count}")
            # Pause between detail requests.
            time.sleep(0.65)
            # All information has been downloaded and preprocessed.
            print(info)
            # The insert below is disabled; uncomment it to persist 'info'.
            # try:
            #     self.cur.execute("INSERT INTO medicine_chengyao VALUES(?,?,?,?,?,?,?,?,?,?)",
            #                      (info.get("产品名称", "NULL"), info.get("规格", "NULL"), info.get("剂型", "NULL"),
            #                       info.get("包装单位", "NULL"), info.get("生产厂家", "NULL"),
            #                       info.get("条形码", "NULL"), info.get("主治疾病", "NULL"), info.get("批准文号", "NULL"),
            #                       info.get("是否处方", "NULL"), info.get("说明书", "NULL")))
            #     self.conn.commit()
            # except(sqlite3.Error):
            #     self.conn.rollback()
            #     print("saving error!")

    def _fetch_info(self, each_id):
        """Download the detail page at ``self.head + each_id`` and return its fields as a dict."""
        resp = requests.get(self.head + each_id, headers=self.headers, timeout=self._TIMEOUT)
        ehtml = etree.HTML(resp.content.decode("utf-8"))

        info = {"产品名称": ehtml.xpath("""//span[@itemprop="name"]/text()""")[0]}
        base_info = ehtml.xpath("""//div[@class = "block-info-prop text-oneline"]//text()""")
        info.update(self._parse_base_info(base_info))

        # The linked approval number replaces whatever the property block gave;
        # a page without that link raises IndexError and is counted as failed.
        info["批准文号"] = ehtml.xpath("//td/div/u/a/text()")[0]
        # True when the page contains a <span class="cRed"> under td/div.
        info["是否处方"] = bool(ehtml.xpath("""//td/div/span[@class="cRed"]"""))
        info["说明书"] = self._join_manual(ehtml.xpath("""//ul[@class="property"]//text()"""))
        return info

    @classmethod
    def _parse_base_info(cls, base_info):
        """Map each known label in *base_info* to the text node that follows it."""
        parsed = {}
        for label in base_info:
            if label not in cls._TITLES:
                continue
            # Value is the node right after the label's first occurrence;
            # skip the label when nothing follows or the next node is a label.
            following = base_info[base_info.index(label) + 1:]
            if not following or following[0] in cls._TITLES:
                continue
            value = following[0]
            if label in ('规格：', '剂型：'):
                # Drops the last three characters -- presumably trailing
                # site text after the value; verify against a live page.
                value = value[:-3]
            elif label == '主治疾病：':
                # Remove non-breaking spaces and all other whitespace.
                value = "".join(value.replace("\xa0", "").split())
            # Key is the label without its trailing full-width colon.
            parsed[label[:-1]] = value
        return parsed

    @staticmethod
    def _join_manual(fragments):
        """Concatenate the distinct text fragments, in order, without the lone-space node."""
        unique = list(dict.fromkeys(fragments))
        # Only remove " " when present, so a page without it does not raise.
        if " " in unique:
            unique.remove(" ")
        return "".join(unique)




 def main():
    """Crawl each selected category page by page and report the failure count.

    Pages 1..2500 of every category are requested in order; the inner loop
    stops early as soon as ``Medicine.get_data`` sets ``flag`` to 1.  The
    SQLite connection is closed even when the crawl aborts with an exception.
    """
    medicine = Medicine()
    # Only category 131 is crawled; 118 and 119 were also listed but unused.
    categories = [131]
    try:
        for category in categories:
            for page in range(1, 2501):
                medicine.get_data(category, page)
                time.sleep(0.1)
                if medicine.flag == 1:
                    break
        print("下载失败的条目数量：", medicine.failed_num)
    finally:
        medicine.conn.close()


 # Start the crawl only when this file is run as a script, not when imported.
 if __name__ == "__main__":
    main()

 # Failed at 429; 430 is missing and should be filled in later.
 # Medicine Information Spider
 # Spider: requests
 # parser: XPath
 # database: SQLite
 # Modules required: requests,lxml,(database connection)
 # -*- coding: utf-8 -*-
 # author:Shen tongle in DASE_ECNU 2021

 import requests
 from lxml import etree
 import time
 import sqlite3


 class Medicine:
    """Scraper for the medicine listing and detail pages of www.315jiage.cn.

    ``get_data`` crawls one listing page, follows every detail link on it,
    and prints one dict per medicine.  The SQLite insert is present only as
    commented-out code, so nothing is written to the database by default.
    """

    # Labels that introduce a value inside the detail page's property block.
    _TITLES = ('规格：', '剂型：', '包装单位：', '批准文号：', '生产厂家：', '条形码：', '主治疾病：')
    # Seconds to wait for a single HTTP response before giving up.
    _TIMEOUT = 30

    def __init__(self):
        # Prefix prepended to the hrefs found on listing pages.
        self.head = "https://www.315jiage.cn/"
        self.user = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36"
        self.headers = {
            "User-agent": self.user
        }
        # Set to 1 by get_data() when the listing page's title starts with "您".
        self.flag = 0
        # Number of detail pages that could not be downloaded or parsed.
        self.failed_num = 0
        self.conn = sqlite3.connect('medicine.db')
        self.cur = self.conn.cursor()

    def get_data(self, type, page):
        """Crawl one listing page and print the record of every medicine on it.

        ``self.flag`` is reset to 0 on entry and set to 1 (followed by an
        early return) when the listing page's title starts with "您"; the
        caller uses that as its stop signal.  Detail pages that fail to
        download or parse are counted in ``self.failed_num`` and skipped.

        Args:
            type: Category number interpolated into the listing URL.  The
                name shadows the builtin but is kept for existing callers.
            page: Page number interpolated into the listing URL.

        Raises:
            requests.exceptions.RequestException: If the listing page itself
                cannot be fetched (only detail-page errors are swallowed).
        """
        self.flag = 0
        url = f"https://www.315jiage.cn/mc{type}p{page}.aspx"
        response = requests.get(url, headers=self.headers, timeout=self._TIMEOUT)
        self.content = response.content.decode("utf-8")
        html = etree.HTML(self.content)

        # NOTE(review): a title starting with "您" presumably marks the site's
        # notice page served past the last listing page -- confirm.
        # Guarded lookup: a page without a <title> no longer raises IndexError.
        page_title = html.xpath("//head/title/text()")
        if page_title and page_title[0].startswith("您"):
            self.flag = 1
            return

        results = html.xpath("""//div[@class="col-2"]/a[@target="_blank"]/@href""")
        print(f"Browsing page {page}")
        count = 0
        for each_id in results:
            try:
                info = self._fetch_info(each_id)
            except (IndexError, UnicodeDecodeError, TimeoutError,
                    requests.exceptions.RequestException):
                # RequestException covers HTTPError, ConnectionError and the
                # Timeout errors requests actually raises; IndexError means a
                # mandatory element was missing from the page.
                print("Failed to download data!")
                self.failed_num += 1
                continue

            count += 1
            print(f"Saving infomation {page}-{count}")
            # Pause between detail requests.
            time.sleep(0.65)
            # All information has been downloaded and preprocessed.
            print(info)
            # The following code is meant to be used only once, when the raw
            # medicine data needs to be stored. Uncomment it when using.
            # try:
            #     self.cur.execute("INSERT INTO medicine_chengyao VALUES(?,?,?,?,?,?,?,?,?,?)",
            #                      (info.get("产品名称", "NULL"), info.get("规格", "NULL"), info.get("剂型", "NULL"),
            #                       info.get("包装单位", "NULL"), info.get("生产厂家", "NULL"),
            #                       info.get("条形码", "NULL"), info.get("主治疾病", "NULL"), info.get("批准文号", "NULL"),
            #                       info.get("是否处方", "NULL"), info.get("说明书", "NULL")))
            #     self.conn.commit()
            # except(sqlite3.Error):
            #     self.conn.rollback()
            #     print("saving error!")

    def _fetch_info(self, each_id):
        """Download the detail page at ``self.head + each_id`` and return its fields as a dict."""
        resp = requests.get(self.head + each_id, headers=self.headers, timeout=self._TIMEOUT)
        ehtml = etree.HTML(resp.content.decode("utf-8"))

        info = {"产品名称": ehtml.xpath("""//span[@itemprop="name"]/text()""")[0]}
        base_info = ehtml.xpath("""//div[@class = "block-info-prop text-oneline"]//text()""")
        info.update(self._parse_base_info(base_info))

        # The linked approval number replaces whatever the property block gave;
        # a page without that link raises IndexError and is counted as failed.
        info["批准文号"] = ehtml.xpath("//td/div/u/a/text()")[0]
        # True when the page contains a <span class="cRed"> under td/div.
        info["是否处方"] = bool(ehtml.xpath("""//td/div/span[@class="cRed"]"""))
        info["说明书"] = self._join_manual(ehtml.xpath("""//ul[@class="property"]//text()"""))
        return info

    @classmethod
    def _parse_base_info(cls, base_info):
        """Map each known label in *base_info* to the text node that follows it."""
        parsed = {}
        for label in base_info:
            if label not in cls._TITLES:
                continue
            # Value is the node right after the label's first occurrence;
            # skip the label when nothing follows or the next node is a label.
            following = base_info[base_info.index(label) + 1:]
            if not following or following[0] in cls._TITLES:
                continue
            value = following[0]
            if label in ('规格：', '剂型：'):
                # Drops the last three characters -- presumably trailing
                # site text after the value; verify against a live page.
                value = value[:-3]
            elif label == '主治疾病：':
                # Remove non-breaking spaces and all other whitespace.
                value = "".join(value.replace("\xa0", "").split())
            # Key is the label without its trailing full-width colon.
            parsed[label[:-1]] = value
        return parsed

    @staticmethod
    def _join_manual(fragments):
        """Concatenate the distinct text fragments, in order, without the lone-space node."""
        unique = list(dict.fromkeys(fragments))
        # Only remove " " when present, so a page without it does not raise.
        if " " in unique:
            unique.remove(" ")
        return "".join(unique)




 def main():
    """Crawl each selected category page by page and report the failure count.

    Pages 1..2500 of every category are requested in order; the inner loop
    stops early as soon as ``Medicine.get_data`` sets ``flag`` to 1.  The
    SQLite connection is closed even when the crawl aborts with an exception.
    """
    medicine = Medicine()
    # Only category 131 is crawled; 118 and 119 were also listed but unused.
    categories = [131]
    try:
        for category in categories:
            for page in range(1, 2501):
                medicine.get_data(category, page)
                time.sleep(0.1)
                if medicine.flag == 1:
                    break
        print("下载失败的条目数量：", medicine.failed_num)
    finally:
        medicine.conn.close()


 # Start the crawl only when this file is run as a script, not when imported.
 if __name__ == "__main__":
    main()