From 5f593316eb6914da1fda68dc1f1bcba6f3c0472b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B1=A4=E8=B6=8A?= <1196656373@qq.com> Date: Fri, 2 Sep 2022 08:32:12 +0800 Subject: [PATCH] Add comments and pydoc --- Src/medi_fetch/main.py | 280 +++++++++++++++++++++++++------------------------ 1 file changed, 141 insertions(+), 139 deletions(-) diff --git a/Src/medi_fetch/main.py b/Src/medi_fetch/main.py index fc88780..ded9066 100644 --- a/Src/medi_fetch/main.py +++ b/Src/medi_fetch/main.py @@ -1,139 +1,141 @@ -# Medicine Infomation Spider -# Spider: requests -# parser: XPath -# database: ? -# Modules required: requests,lxml,(database connection) -# -*- coding: utf-8 -*- -# author:Shen tongle in DASE_ECNU 2021 - -import requests -from lxml import etree -import time -import sqlite3 - - -class Medicine: - def __init__(self): - self.head = "https://www.315jiage.cn/" - self.user = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36" - self.headers = { - "User-agent": self.user - } - self.flag = 0 - self.failed_num = 0 - self.conn = sqlite3.connect('medicine.db') - self.cur = self.conn.cursor() - - - def get_data(self, type, page): - self.flag = 0 - url = f"https://www.315jiage.cn/mc{type}p{page}.aspx" - response = requests.get(url, headers=self.headers, timeout=30) - self.content = response.content.decode("utf-8") - html = etree.HTML(self.content) - if html.xpath("//head/title/text()")[0][0] == "您": - self.flag += 1 - return - results = html.xpath("""//div[@class="col-2"]/a[@target="_blank"]/@href""") - print(f"Browsing page {page}") - count = 0 - for each_id in results: - try: - new_url = self.head + each_id - resp = requests.get(new_url, headers=self.headers) - content = resp.content.decode("utf-8") - ehtml = etree.HTML(content) - - info = {} - - name = ehtml.xpath("""//span[@itemprop="name"]/text()""")[0] - info["产品名称"] = name - - base_info = ehtml.xpath("""//div[@class = "block-info-prop text-oneline"]//text()""") - titles = ['规格:', '剂型:', '包装单位:', '批准文号:', '生产厂家:', '条形码:', '主治疾病:'] - for each in base_info: - if each in titles: - if each == '规格:' or each == '剂型:': - for every_info in base_info[base_info.index(each) + 1:]: - if every_info in titles: - break - else: - info[each[:-1]] = every_info[:-3] - break - elif each == '主治疾病:': - for every_info in base_info[base_info.index(each) + 1:]: - if every_info in titles: - break - else: - illness = "" - info[each[:-1]] = illness.join(every_info.replace("\xa0", "").split()) - break - else: - for every_info in base_info[base_info.index(each) + 1:]: - if every_info in titles: - break - else: - info[each[:-1]] = every_info - break - info["主治疾病"]="测试" - info["批准文号"] = ehtml.xpath("//td/div/u/a/text()")[0] - - info["是否处方"] = bool(ehtml.xpath("""//td/div/span[@class="cRed"]""")) - - temp = ehtml.xpath("""//ul[@class="property"]//text()""") - contents = [] - for content in temp: - if content not in contents: - contents.append(content) - - contents.pop(contents.index(" ")) - - # info["说明书"] = contents - SMS = "" - - info["说明书"] = SMS.join(contents) - - count += 1 - print(f"Saving infomation {page}-{count}") - time.sleep(0.65) - # All infomation has been downloaded and preprocessed! - # ----------------------------------------- - print(info) - # try: - # self.cur.execute("INSERT INTO medicine_chengyao VALUES(?,?,?,?,?,?,?,?,?,?)", - # (info.get("产品名称", "NULL"), info.get("规格", "NULL"), info.get("剂型", "NULL"), - # info.get("包装单位", "NULL"), info.get("生产厂家", "NULL"), - # info.get("条形码", "NULL"), info.get("主治疾病", "NULL"), info.get("批准文号", "NULL"), - # info.get("是否处方", "NULL"), info.get("说明书", "NULL"))) - # self.conn.commit() - # except(sqlite3.Error): - # self.conn.rollback() - # print("saving error!") - # 'info' is going to be saved in one database - # Writing saving codes below...... - except(IndexError,requests.HTTPError,TimeoutError,requests.exceptions.ConnectionError): - print("Failed to download data!") - self.failed_num += 1 - continue - - - - -def main(): - medicine = Medicine() - types = [118, 119, 131] - a = [131] - for type in a: - for i in range(0,2500): - medicine.get_data(type, i + 1) - time.sleep(0.1) - if medicine.flag == 1: - break - print("下载失败的条目数量:", medicine.failed_num) - medicine.conn.close() - - -if __name__ == "__main__": - main() - -# failed in 429, 430 is missing should be complemented +# Medicine Infomation Spider +# Spider: requests +# parser: XPath +# database: SQLite +# Modules required: requests,lxml,(database connection) +# -*- coding: utf-8 -*- +# author:Shen tongle in DASE_ECNU 2021 + +import requests +from lxml import etree +import time +import sqlite3 + + +class Medicine: + def __init__(self): + self.head = "https://www.315jiage.cn/" + self.user = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36" + self.headers = { + "User-agent": self.user + } + self.flag = 0 + self.failed_num = 0 + self.conn = sqlite3.connect('medicine.db') + self.cur = self.conn.cursor() + + + def get_data(self, type, page): + ''' + Obtain data from web, and store into the SQLite database + ''' + self.flag = 0 + url = f"https://www.315jiage.cn/mc{type}p{page}.aspx" + response = requests.get(url, headers=self.headers, timeout=30) + self.content = response.content.decode("utf-8") + html = etree.HTML(self.content) + if html.xpath("//head/title/text()")[0][0] == "您": + self.flag += 1 + return + results = html.xpath("""//div[@class="col-2"]/a[@target="_blank"]/@href""") + print(f"Browsing page {page}") + count = 0 + for each_id in results: + try: + new_url = self.head + each_id + resp = requests.get(new_url, headers=self.headers) + content = resp.content.decode("utf-8") + ehtml = etree.HTML(content) + + info = {} + + name = ehtml.xpath("""//span[@itemprop="name"]/text()""")[0] + info["产品名称"] = name + + base_info = ehtml.xpath("""//div[@class = "block-info-prop text-oneline"]//text()""") + titles = ['规格:', '剂型:', '包装单位:', '批准文号:', '生产厂家:', '条形码:', '主治疾病:'] + for each in base_info: + if each in titles: + if each == '规格:' or each == '剂型:': + for every_info in base_info[base_info.index(each) + 1:]: + if every_info in titles: + break + else: + info[each[:-1]] = every_info[:-3] + break + elif each == '主治疾病:': + for every_info in base_info[base_info.index(each) + 1:]: + if every_info in titles: + break + else: + illness = "" + info[each[:-1]] = illness.join(every_info.replace("\xa0", "").split()) + break + else: + for every_info in base_info[base_info.index(each) + 1:]: + if every_info in titles: + break + else: + info[each[:-1]] = every_info + break + info["主治疾病"]="测试" + info["批准文号"] = ehtml.xpath("//td/div/u/a/text()")[0] + + info["是否处方"] = bool(ehtml.xpath("""//td/div/span[@class="cRed"]""")) + + temp = ehtml.xpath("""//ul[@class="property"]//text()""") + contents = [] + for content in temp: + if content not in contents: + contents.append(content) + + contents.pop(contents.index(" ")) + + # info["说明书"] = contents + SMS = "" + + info["说明书"] = SMS.join(contents) + + count += 1 + print(f"Saving infomation {page}-{count}") + time.sleep(0.65) + # All infomation has been downloaded and preprocessed! + # ----------------------------------------- + print(info) + # The following code only use once, when obtaining raw data of the medicine info is required. Uncomment it when using. + # try: + # self.cur.execute("INSERT INTO medicine_chengyao VALUES(?,?,?,?,?,?,?,?,?,?)", + # (info.get("产品名称", "NULL"), info.get("规格", "NULL"), info.get("剂型", "NULL"), + # info.get("包装单位", "NULL"), info.get("生产厂家", "NULL"), + # info.get("条形码", "NULL"), info.get("主治疾病", "NULL"), info.get("批准文号", "NULL"), + # info.get("是否处方", "NULL"), info.get("说明书", "NULL"))) + # self.conn.commit() + # except(sqlite3.Error): + # self.conn.rollback() + # print("saving error!") + # 'info' is going to be saved in one database + # Writing saving codes below...... + except(IndexError,requests.HTTPError,TimeoutError,requests.exceptions.ConnectionError): + print("Failed to download data!") + self.failed_num += 1 + continue + + + + +def main(): + medicine = Medicine() + types = [118, 119, 131] + a = [131] + for type in a: + for i in range(0,2500): + medicine.get_data(type, i + 1) + time.sleep(0.1) + if medicine.flag == 1: + break + print("下载失败的条目数量:", medicine.failed_num) + medicine.conn.close() + + +if __name__ == "__main__": + main() \ No newline at end of file