用Scrapy编写一组app版本监控的爬虫

首先创建MySQL数据库、表和字段,如下:

-- Database holding the app-version monitoring data (utf8 charset and collation).
CREATE DATABASE jiankong_db DEFAULT CHARACTER SET utf8 COLLATE utf8_general_ci;
-- Table the crawler writes to: one row is inserted for every (version, url)
-- pair that is not already present.
CREATE TABLE `jiankong_db`.`app_version` (
`id` int(10) unsigned  NOT NULL  AUTO_INCREMENT,
`version` text, -- version string scraped from the app page
`url` text, -- URL of the page the version was scraped from
`date` timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, -- time the row was inserted
PRIMARY KEY (`id`)
) ENGINE=MyISAM DEFAULT CHARSET=utf8;

修改配置文件settings.py

# Custom USER_AGENT
USER_AGENT = 'Mozilla/7.0 (compatible; Baiduspider/3.5; +http://www.baidu.com/search/spider.html)'

# Logging configuration
LOG_ENABLED = True
LOG_ENCODING = 'utf-8'
LOG_FILE = 'log.txt'
LOG_LEVEL = 'INFO'
# Redirect the process's standard output (e.g. print output) into the log
LOG_STDOUT = True

# Enable the item pipeline
ITEM_PIPELINES = {
'pachong.pipelines.PachongPipeline': 300,
}

爬虫类的回调函数 Spiders/AppVersion.py

import scrapy
#from 项目名称.items import 项目Item类名
from pachong.items import PachongItem
class AppversionSpider(scrapy.Spider):
    """Scrape the current version string from each monitored app page.

    Every URL in ``start_urls`` is fetched and handed to :meth:`parse`,
    which yields one ``PachongItem`` with the version text and page URL.
    """

    name = "AppVersion"  # spider name
    allowed_domains = ["itunes.apple.com"]
    start_urls = (
        "https://itunes.apple.com/cn/app/jian-li-cai-cai-fu-guan-li/id987830667",  # app1
        "https://itunes.apple.com/cn/app/yin-ke-li-cai-tou-zi-li-cai/id879768943",  # app2
    )

    def parse(self, response):
        """Extract the version from one app page.

        Args:
            response: the downloaded page.

        Yields:
            A ``PachongItem`` with ``version`` and ``url`` set. Nothing is
            yielded when the page has no ``softwareVersion`` element.
        """
        # extract_first() returns None instead of raising IndexError when
        # the element is missing (e.g. the page layout changed).
        version = response.xpath(
            "//span[@itemprop='softwareVersion']/text()"
        ).extract_first()
        if version is None:
            self.logger.warning(
                "No softwareVersion element found at %s", response.url
            )
            return

        item = PachongItem()
        item['version'] = version
        item['url'] = response.url
        yield item

配置返回字段映射 items.py

import scrapy
class PachongItem(scrapy.Item):
    """Item carrying one scraped app version together with its page URL."""
    version = scrapy.Field() # version string of the app
    url = scrapy.Field() # URL of the page the version was scraped from
    #date = scrapy.Field() # time at which this version was scraped

配置返回处理 pipelines.py

import MySQLdb
import smtplib
from email.mime.text import MIMEText
class PachongPipeline(object):
    """Item pipeline that records new app versions and e-mails a notice.

    For every scraped item the pipeline checks whether the (version, url)
    pair already exists in ``app_version``. Only when it does not, the row
    is inserted and a notification e-mail is sent.
    """

    def connect(self):
        """Open and return a new MySQL connection to ``jiankong_db``."""
        # NOTE(review): the credentials are hard-coded here; consider
        # moving them into the project settings.
        return MySQLdb.connect(
            host='localhost',
            user='root',
            passwd='root',
            db='jiankong_db',
            port=3306,
            charset='utf8'
        )

    def send_mail(self, shoujian, title, body):
        """Send an HTML e-mail through smtp.qq.com (best effort).

        Args:
            shoujian: recipient address, or a list/tuple of addresses.
            title: subject line of the message.
            body: HTML body of the message.

        Any failure is printed and swallowed so that a mail problem never
        aborts item processing.
        """
        # Account name, password and mailbox domain of the sender.
        # NOTE(review): the password is a placeholder and is hard-coded.
        mail_user = "610358898"
        mail_pass = "填写密码"
        mail_postfix = "qq.com"
        me = mail_user + "<" + mail_user + "@" + mail_postfix + ">"

        msg = MIMEText(body, 'html', 'utf-8')
        msg['Subject'] = title
        # From/To headers were missing from the original message.
        msg['From'] = me
        if isinstance(shoujian, (list, tuple)):
            msg['To'] = ", ".join(shoujian)
        else:
            msg['To'] = shoujian

        mail = smtplib.SMTP()
        try:
            mail.connect("smtp.qq.com")  # SMTP server
            mail.login(mail_user, mail_pass)
            mail.sendmail(me, shoujian, msg.as_string())
            print(u"send mail success!")
        except Exception as e:
            print(str(e))
            print(u"send mail exit!")
        finally:
            # Release the socket even when connect/login/sendmail failed.
            mail.close()

    def process_item(self, item, spider):
        """Store the item if it is new and notify by e-mail.

        Args:
            item: scraped item with ``version`` and ``url`` keys.
            spider: the spider that produced the item (unused).

        Returns:
            The unchanged item, so that later pipelines still receive it.
        """
        print(u"测试最新抓到的数据: %s %s" % (item['version'], item['url']))
        # SQL template: look for an existing record
        sqli_select = "SELECT * FROM app_version WHERE version = %s AND url = %s"
        # SQL template: add a record
        sqli = "INSERT INTO app_version (id,version,url,date) VALUES(NULL,%s,%s,CURRENT_TIMESTAMP)"

        is_new = False
        conn = self.connect()
        try:
            cur = conn.cursor()
            # The return value of execute() is used as the match count.
            result = cur.execute(sqli_select, (item['version'], item['url']))
            print(u"返回结果: %s" % result)
            if result < 1:
                print(u"准备插入了!")
                cur.execute(sqli, (item['version'], item['url']))
                conn.commit()  # commit the insert
                is_new = True
        finally:
            # Close the connection on every path, including errors.
            conn.close()

        if is_new:
            # Notify only after the database work is finished.
            body = "VERSION:" + str(item['version']) + "<br>" + "URL:" + str(item['url'])
            self.send_mail(shoujian="454690484@qq.com", title="IOS APP版本更新通知!", body=body)
        return item

########测试了Scrapy自带的发邮件功能,配置了半天也没用。官网写得很简单,配起来却各种问题。发现对于问题比较少的爬虫,在不熟悉Scrapy的情况下,用纯Python可能会更快一些,所以也重写了一个Python原生版的,而且会一直在内存中跑,遇到问题则会自动发邮件,代码如下########

import sys,os,requests,time,logging,re,random,lxml,smtplib
from bs4 import BeautifulSoup
from email.mime.text import MIMEText
import MySQLdb
# Work around output-encoding problems by forcing the default encoding to UTF-8.
# NOTE(review): reload(sys) / sys.setdefaultencoding exist only on Python 2.
reload(sys)
sys.setdefaultencoding( "utf-8" )
# Request headers intended for fetching the page content.
# NOTE(review): no request in the visible code passes `headers` — confirm it is still needed.
headers={       "Host":"itunes.apple.com",
"User-Agent":"Mozilla/7.0 (compatible; Baiduspider/3.5; +http://www.baidu.com/search/spider.html)",
"Accept":"*/*",
"Accept-Language":"zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
"Accept-Encoding":"gzip, deflate",
"Content-Type":"application/x-www-form-urlencoded; charset=UTF-8",
"X-Requested-With":"XMLHttpRequest",
"Connection":"keep-alive"
}
# Shared requests session used for all page fetches
sid = requests.session()
# URLs of the app pages to monitor
urls = [
"https://itunes.apple.com/cn/app/jian-li-cai-cai-fu-guan-li/id987830667",
"https://itunes.apple.com/cn/app/yin-ke-li-cai-tou-zi-li-cai/id879768943",
]
# Dict holding the url/version record currently being processed
results = {}
# Open the database connection (done once, at import time)
conn = MySQLdb.connect(
host = 'localhost',
user = 'root',
passwd = 'root',
db = 'jiankong_db',
port = 3306,
charset='utf8'
)
cur=conn.cursor()
sqli_select = "SELECT * FROM app_version WHERE version = %s AND url = %s" # SQL template: look for an existing record
sqli="INSERT INTO app_version (id,version,url,date) VALUES(NULL,%s,%s,CURRENT_TIMESTAMP)" # SQL template: add a record

# Fetch the version string of one app page
def mouthpiece(url):
    """Download an app page and return the version text it advertises.

    Args:
        url: address of the app page to fetch.

    Returns:
        The text of the ``<span itemprop="softwareVersion">`` element.

    Raises:
        requests.exceptions.RequestException: the request failed, timed
            out, or the server answered with an HTTP error status.
        ValueError: the page contains no ``softwareVersion`` element.
    """
    # Without a timeout a stalled connection would block the long-running
    # monitor forever.
    response = sid.get(url, timeout=30)
    response.raise_for_status()
    page = BeautifulSoup(response.content, 'lxml')
    version_tag = page.find("span", {"itemprop": "softwareVersion"})
    if version_tag is None:
        # find() returns None when nothing matches; fail with a clear
        # message instead of an AttributeError on None.
        raise ValueError("softwareVersion element not found at %s" % url)
    return version_tag.get_text()

# Main crawl loop
def start_app(sleep):
    """Poll every monitored URL forever, storing and announcing new versions.

    Args:
        sleep: seconds to wait between two polling rounds.

    This function never returns normally; it ends only when an exception
    (from the fetch, the database or elsewhere) propagates to the caller.
    """
    # A plain loop replaces the original self-recursion, which would grow
    # the call stack on every round until the recursion limit was hit.
    while True:
        for url in urls:
            results['url'] = url
            results['version'] = mouthpiece(url)
            # The return value of execute() is used as the number of rows
            # already stored for this (version, url) pair.
            results_sql = cur.execute(sqli_select, (results['version'], results['url']))
            print(u"%s %s %s" % (
                time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                u"数据库查询返回结果:",
                results_sql,
            ))
            if results_sql < 1:
                print(u"%s %s" % (
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                    u"有更新了!",
                ))
                cur.execute(sqli, (results['version'], results['url']))
                conn.commit()
                # Send the notification after the record is saved. The
                # <br> keeps URL and version apart in the HTML mail.
                body = "URL:" + results['url'] + "<br>" + "version:" + results['version']
                send_mail(shoujian="454690484@qq.com", title="APP版本更新通知!", body=body)
            else:
                print(u"%s %s %s %s" % (
                    time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()),
                    u"没有更新!",
                    results['url'],
                    results['version'],
                ))
        # Wait once per full pass over all URLs.
        time.sleep(sleep)

# Mail helper: send_mail(recipient, subject, body)
def send_mail(shoujian, title, body):
    """Send an HTML e-mail through smtp.qq.com (best effort).

    Args:
        shoujian: recipient address, or a list/tuple of addresses.
        title: subject line of the message.
        body: HTML body of the message.

    Any failure is printed and swallowed so that a mail problem never
    stops the monitor.
    """
    # Sender account and password.
    # NOTE(review): the password is a placeholder and is hard-coded.
    mail_user = '610358898@qq.com'
    mail_pass = "写自己邮箱密码"
    # mail_user is already a full address, so the domain must not be
    # appended again (the original produced "...@qq.com@qq.com").
    me = mail_user.split("@")[0] + "<" + mail_user + ">"

    msg = MIMEText(body, 'html', 'utf-8')
    msg['Subject'] = title
    # From/To headers were missing from the original message.
    msg['From'] = me
    if isinstance(shoujian, (list, tuple)):
        msg['To'] = ", ".join(shoujian)
    else:
        msg['To'] = shoujian

    mail = smtplib.SMTP()
    try:
        mail.connect("smtp.qq.com")  # SMTP server
        mail.login(mail_user, mail_pass)
        mail.sendmail(me, shoujian, msg.as_string())
        print(u"send mail success!")
    except Exception as e:
        print(str(e))
        print(u"send mail exit!")
    finally:
        # Release the socket even when connect/login/sendmail failed.
        mail.close()

#send_mail(shoujian = "admin@0535code.com",title = "test",body = u"测试内容")

def c_main():
    """Run the monitor forever, restarting it after every failure.

    Each failure is printed, written to ``error.log``, followed by a
    10-second pause and reported by e-mail before the monitor restarts.
    KeyboardInterrupt and SystemExit are not caught, so the script can
    still be stopped.
    """
    # A loop replaces the original recursive restart, which added stack
    # frames on every failure.
    while True:
        try:
            print(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()) + u" ------开始解析执行监控...")
            # NOTE(review): the original comment said "run every 10
            # minutes", but the interval passed here is 6 seconds — confirm.
            start_app(6)
        except Exception:
            # Print and log the error
            error_info = sys.exc_info()
            print("Unexpected error: %s" % (error_info,))
            logging.basicConfig(filename='error.log')
            logging.exception("Exception Logged")
            time.sleep(10)  # back off for 10 seconds after a failure
            send_mail(shoujian="admin@0535code.com", title="IOS APP版本监控BUG反馈", body=str(error_info))

# Script entry point: start the monitor when run directly
if __name__ == "__main__":
    c_main()

这是后面增加代码功能后整理的格式,如果有问题,欢迎吃瓜群众留言。

发表评论

电子邮件地址不会被公开。 必填项已用*标注