# -*- coding: utf-8 -*-
__author__ = 'vanxkr.com'
import sqlite3
db_name = 'test.db'
sql_drop_url = '''DROP TABLE IF EXISTS new_url;
DROP TABLE IF EXISTS old_url;'''
sql_create_new_url = '''CREATE TABLE IF NOT EXISTS new_urls (
id INTEGER PRIMARY KEY ASC AUTOINCREMENT
UNIQUE
NOT NULL,
url VARCHAR (1000) UNIQUE
NOT NULL ON CONFLICT IGNORE
);'''
sql_create_old_url = '''CREATE TABLE IF NOT EXISTS old_urls (
id INTEGER PRIMARY KEY ASC AUTOINCREMENT
UNIQUE
NOT NULL,
url VARCHAR (1000) UNIQUE
NOT NULL ON CONFLICT IGNORE
);'''
class vanxkr_uvm(object):
def __init__(self):
self.__cache_get = []
self.__current_get_num = 0 # 记录取出数量
self.__cache_get_num = 100 # 缓冲池 每__cache_new_num条url存一次数据库
self.__cache_put = []
self.__cache_put_num = 1000 # 缓冲池 每__cache_old_num条url存一次数据库
self._create_table()
self._refresh_get_cache()
def __del__(self):
self._save_to_urls('new_urls', self.__cache_get[self.__current_get_num:])
self._save_to_urls('old_urls', self.__cache_get[:self.__current_get_num])
self.__cache_get = []
def _create_table(self):
db = sqlite3.connect(db_name)
cursor = db.cursor()
cursor.execute(sql_create_new_url)
cursor.execute(sql_create_old_url)
db.commit()
db.close()
def _save_to_urls(self, table_name, urls):
db = sqlite3.connect(db_name)
cursor = db.cursor()
save_list = []
for i in urls: # 筛选 urls
sql = 'SELECT count(*) FROM %s WHERE url = "%s";' % (table_name, i)
cursor.execute(sql) # 查询是否在 table_name 的里
if(cursor.fetchall()[0][0]): # 存在
continue
else: # 不存在 就要存储
save_list.append(i)
if save_list:
sql = 'INSERT INTO %s (url) VALUES ' % table_name
for i in save_list:
sql += '("%s"),' % i
sql = sql[:-1] + ';'
try:
cursor.execute(sql) # 存入 table_name
except Exception as e: # 出错
print(e)
print(sql)
else: # 存入 table_name 成功
db.commit()
db.close()
def _refresh_get_cache(self):
self._save_to_urls('old_urls', self.__cache_get)
self.__cache_get = []
self._refresh_put_cache()
sql = 'SELECT url FROM new_urls LIMIT %s;' % self.__cache_get_num
db = sqlite3.connect(db_name)
cursor = db.cursor()
cursor.execute(sql)
self.__cache_get = list(i[0] for i in cursor.fetchall())
# 回删 已取出的数据
sql = 'delete from new_urls where id in (select id from new_urls order by id limit %s);' % self.__cache_get_num
cursor.execute(sql)
db.commit()
db.close()
self.__current_get_num = 0 # 重置
def _refresh_put_cache(self):
tmp = list(set(self.__cache_get)^set(self.__cache_put)) # 取出去的 和 拿进来的 的 差集
self.__cache_put = []
db = sqlite3.connect(db_name)
cursor = db.cursor()
save_list = []
for i in tmp: # 筛选 new_urls
sql_old = 'SELECT count(*) FROM old_urls WHERE url = "%s";' % i
cursor.execute(sql_old) # 查询是否在 old_urls 里
if cursor.fetchall()[0][0]: # 存在
continue
else: # 不存在 就要存储
save_list.append(i)
db.close()
self._save_to_urls('new_urls', save_list)
def reset_db(self):
db = sqlite3.connect(db_name)
cursor = db.cursor()
cursor.execute(sql_drop_url)
db.close()
self._create_table()
def get_url(self):
if self.__cache_get:
ret_url = self.__cache_get[self.__current_get_num] # 取出一个新的url
self.__current_get_num += 1
if(self.__current_get_num == self.__cache_get_num\
or self.__current_get_num == len(self.__cache_get)): # 是否用完了
self._refresh_get_cache() # 更新缓冲池
return ret_url;
else:
self._refresh_get_cache()
return 'null'
def put_urls(self, urls):
self.__cache_put.extend(urls)
if (0 == len(self.__cache_get) # 获取缓存区是空的
or self.__current_get_num == len(self.__cache_get)): # 获取缓存区已用完
self._refresh_get_cache()
if len(self.__cache_put) >= self.__cache_put_num: # 缓冲池满了
self._refresh_put_cache() # 清空缓冲池
def get_history_num(self):
sql = 'SELECT count(*) FROM old_urls;'
db = sqlite3.connect(db_name)
cursor = db.cursor()
cursor.execute(sql)
return cursor.fetchall()[0][0]
python爬虫url管理器[url_version_manage.py]
可以请我喝杯咖啡吗QAQ~
本文作者:vanxkr
本文链接:http://www.vanxkr.com/2018/8/python-url-version-manage
版权声明:本博客所有文章除特别声明外,均采用CC BY-NC-SA 3.0许可协议。转载请注明出处!
0 条评论