JieQi Novel: Pyspider Script-Writing Guide

Author: 蓝大富 | Published: 2020-02-11 | 260 views | 9,876 characters, ~25 min read
import sys
reload(sys)                      # Python 2: re-import sys so setdefaultencoding is available again
sys.setdefaultencoding('utf8')   # force UTF-8 as the default string encoding (Python 2 only)
import copy
import json
import time
import datetime
import re
from pyspider.libs.base_handler import *
from pyquery import PyQuery as pq
result_template = {
    "info_id": "",          # article ID (auto-increment)
    "url": "",              # original URL
    "title": "",            # title
    "subheading": "",       # subtitle
    "fetch_time": "",
    "pub_time": "",         # publication time taken from the article body, not the time the crawler fetched it
    "sort": "",             # category interface (?)
    "summary": "",          # article summary
    "content": "",          # body text
    "persons": "",          # people mentioned
    "companys": "",         # companies mentioned
    "stocknames": "",       # stocks mentioned
    "stockcodes": "",       # stock codes mentioned
    "industries": "",       # industries involved
    "sections": "",         # market sections involved
    "others": "",
    "info_type": "",        # article type: announcement / news
    "source": "",           # publishing organization
    "info_channel": "",     # second-level channel and everything below it; channels are joined with "_", excluding "首页" (home) and "正文" (body)
    "editor": "",           # editor
    "keywords": "",         # keywords shipped with the article
    "datetime": "",         # crawl time
    "imageAttachment": "null",  # image attachments
    "fileAttachment": "null",   # file attachments
    "html": "",
}
source_name = "中国金融网"

source_list = [
    {
        "url": "http://www.cnfinance.cn/articles/?template=sample_397.html&page=%s",
        "source_channel": "新闻",
    },
    {
        "url": "http://www.financeun.com/articleList/1.shtml?page=%s",
        "source_channel": "焦点",
        "source_name": "中国金融网",
    },
]

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36'
}
class Handler(BaseHandler):
    crawl_config = {}

    @every(minutes=2 * 60)
    def on_start(self):
        for source in source_list:
            url = source['url']
            source_channel = source['source_channel']
            for i in range(1, 2):
                self.crawl(url % str(i), headers=headers, callback=self.index_page, save=source)

    @config(age=1)
    def index_page(self, response):
        for each in response.doc('dl.dl_artListB dt a').items():
            href = each.attr.href
            if href:
                self.crawl(href, headers=headers, callback=self.detail_page, save=response.save)

    @config(priority=2, age=10 * 24 * 60 * 60)
    def detail_page(self, response):
        result = copy.deepcopy(result_template)
        result["url"] = response.url
        result["source_channel"] = response.save['source_channel']
        result["source_name"] = source_name
        if response.doc('div.contDetailsBox').html():
            result["html"] = response.doc('div.contDetailsBox').html().strip()
        result["editor"] = response.doc('p.p_author.span').text().replace('作者:', '')
        result["source"] = response.doc('p.p_artInfo span').eq(1).text().replace('摘自:', '')
        result["title"] = response.doc('h2.h2_artDetails').text()
        result["pub_time"] = response.doc('p.p_artInfo span').eq(0).text().replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
        result["content"] = get_content_from_html(result["html"])
        result["pub_time"] = str_2_timestamp(result["pub_time"])
        result["pub_time"] = get_pub_time(result["pub_time"])
        result["datetime"] = get_now_time()
        self.send_message(self.project_name, result, url=result["url"])

    def json_handler(self, response):
        # Handler for JSON APIs; expects title/author/display_time to be passed in via save.
        result = copy.deepcopy(result_template)
        data = json.loads(response.text)
        result["url"] = response.url
        result["title"] = response.save['title']
        result["author"] = response.save['author']
        html = "%s" % response.save['title']
        html += data['data']['content']
        result['html'] = html
        result["content"] = get_content_from_html(html)
        result["summary"] = data['data']['content_short']
        result['pub_time'] = timestamp_to_str(response.save['display_time'])
        self.send_message(self.project_name, result, url=result["url"])

    def on_message(self, project, msg):
        return msg
def get_content(response):
    import chardet
    from readability import Document
    import html2text
    char_encoding = chardet.detect(response.content)  # response.content is bytes
    if char_encoding["encoding"] in ("utf-8", "utf8"):
        doc = Document(response.content.decode("utf-8"))
    else:
        doc = Document(response.content.decode("gbk", "ignore"))
    title = doc.title()
    content = doc.summary()
    h = html2text.HTML2Text()
    h.ignore_links = True
    # h.ignore_images = True
    d_data = h.handle(content).replace("-\n", "-")
    return d_data.rstrip()

def str_2_timestamp(time_str, fmt="%Y-%m-%d %H:%M:%S"):
    # Pick a strptime format from the string length, then convert to a Unix timestamp.
    if not time_str:
        return ""
    elif len(time_str) == 9:
        fmt = "%Y-%m-%d"      # e.g. "2020-2-11"
    elif len(time_str) == 10:
        fmt = "%Y-%m-%d"      # e.g. "2020-02-11"
    elif len(time_str) == 13:
        fmt = "%Y-%m-%d %H"
    elif len(time_str) == 16:
        fmt = "%Y-%m-%d %H:%M"
    return int(time.mktime(time.strptime(time_str, fmt)))

def timestamp_to_str(timestamp, fmt="%Y-%m-%d %H:%M:%S"):
    # Referenced by json_handler above but missing from the original script;
    # a minimal implementation that formats a Unix timestamp as a string.
    return time.strftime(fmt, time.localtime(int(timestamp)))

def get_content_from_html(html):
    import html2text
    h = html2text.HTML2Text()
    h.ignore_links = True
    # h.ignore_images = True
    d_data = h.handle(html).replace("-\n", "-")
    return d_data.rstrip()

def get_pub_time(response):
    # Normalize a timestamp to 10 digits (seconds): multiplying by 10 and keeping the
    # first 10 characters works for both second- and millisecond-precision inputs.
    return str(response * 10)[0:10]

def re_search_time(time_str):
    # Extract "YYYY-MM-DD", "YYYY-MM-DD HH:MM" or "YYYY-MM-DD HH:MM:SS" from a string.
    r_str = r"(\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}:\d{1,2}|\d{4}-\d{1,2}-\d{1,2}\s\d{1,2}:\d{1,2}|\d{4}-\d{1,2}-\d{1,2})"
    mat = re.search(r_str, time_str)
    if not mat:
        return ""
    return mat.group(0)

def re_sub_html(html):
    # NOTE: the regex pattern was lost in the original post; supply the tag pattern to strip.
    return re.sub(r'', '', html)

def get_now_time():
    return str(int(time.time()))
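Before wiring the helpers into a handler, it is worth sanity-checking them in isolation. A minimal sketch (the sample values are made up):

print(str_2_timestamp("2020-02-11"))   # 10-char date string -> Unix timestamp (seconds)
print(get_pub_time(1581379200))        # second-precision input      -> "1581379200"
print(get_pub_time(1581379200000))     # millisecond-precision input -> "1581379200"
print(get_now_time())                  # current time as a 10-digit string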
    
    
Basic handling

Tag concatenation
html = "%s" % result["title"]
html += "%s" % response.doc("div#News_Body_Txt_A").html()
result["title"] = response.doc('div.lasttle>p').remove()  # remove the matched tags
result["pub_time"] = response.doc('div.source').addClass('beauty')
result["source"] = response.doc('div.article-info > span:contains("来源")').text()
result["html"] = response.doc('div.article_content').remove('div.article_content>div:last-child').html()
result['source'] = response.doc('div.attr span').eq(1).text().replace(u'来源:', '')  # eq(index)
response.doc('ul#news_list >li:nth-child(-n+20) > div[class|="txt"] > h3 > a')  # the first 20 <li> items
response.doc('div.newsCon section:nth-child(2) p:nth-last-child(-n+3)')  # the last three <p> tags
response.doc("td.STYLE4").parent().parent()('tr').eq(1).text()  # eq(index): 0 is the first row, 1 the second
response.doc('div.weicode').nextAll().remove()
response.doc("div.Middle4 div.Middle4_body div.txt1").nextAll().remove()
content_table = response.doc("div.portlet table.hui12").siblings('table')
For pages buried in tbody/table/tr/td nesting, selecting only the table tags is often enough: body > table table table table:nth-child(2) table table td
.next() returns the sibling immediately following each element in the matched set; given a selector, it returns the next sibling only if it matches that selector.
page = source['page'] if 'page' in source else num -- set page=1 for channels that should not paginate, or rely on a global num to crawl num pages, so no per-channel pagination check is needed.
A consolidated, runnable sketch of a few of these patterns follows this list.
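A minimal, self-contained sketch of the eq(index) and nextAll().remove() patterns (the sample HTML is made up for the demo):

from pyquery import PyQuery as pq

doc = pq('<div>'
         '<div class="attr"><span>编辑</span><span>来源:蓝鲸TMT网</span></div>'
         '<div class="weicode">qrcode</div><div class="ad">promo</div>'
         '</div>')

print(doc('div.attr span').eq(1).text().replace(u'来源:', u''))  # eq(1): the second <span>
doc('div.weicode').nextAll().remove()   # drop every sibling after div.weicode
print(doc.html())                       # the promo <div> is gone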
Removing tags
result["html"] = re.sub(r'\<section.*?\<\/section\>', '', result["html"], flags=re.S)
result["html"] = re.sub(r'<\/section\>', '', result["html"], flags=re.S)  # strip any orphan closing tags
Or with pyquery instead of regex:
html = pq(result["html"])
result["html"] = html.remove('section').html().replace('\n \n', '')
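The pyquery variant is easy to verify on made-up sample HTML:

from pyquery import PyQuery as pq

sample = '<div><p>body text</p><section><p>promo block</p></section></div>'
cleaned = pq(sample)
cleaned.remove('section')   # drop every <section> subtree
print(cleaned.html())       # -> <p>body text</p>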
Time handling
"N days / hours / minutes ago" relative times: print((datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d %H:%M"))
str(datetime.datetime.now().month) gives the current month; the current year and day work the same way via .year and .day
Times like "今天08:56": time.strftime('%Y-%m-%d', time.localtime()) + ' ' + result["pub_time"].replace('今天', ''), or
str(datetime.datetime.now().month) + ' ' + result["pub_time"].replace('今天', '')
Compact dates like a = "20181107" can be sliced and re-joined: a_time = '-'.join([a[0:4], a[4:6], a[6:8]])
A runnable sketch of these conversions follows this list.
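A minimal sketch of the three normalizations (the sample strings are made up):

import time
import datetime

# "1 day ago" style relative time -> absolute time
print((datetime.datetime.now() - datetime.timedelta(days=1)).strftime("%Y-%m-%d %H:%M"))

# "今天08:56" -> prefix today's date
pub_time = "今天08:56"
print(time.strftime('%Y-%m-%d', time.localtime()) + ' ' + pub_time.replace('今天', ''))

# "20181107" -> "2018-11-07" by slicing and re-joining
a = "20181107"
print('-'.join([a[0:4], a[4:6], a[6:8]]))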
Encoding fixes
json.loads(unicode_str)  # json.loads accepts a Unicode string directly
data.decode("unicode-escape").encode('utf-8')  # turn \uXXXX escapes into Chinese text (Python 2)
A short runnable sketch follows.
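A small Python 2 sketch of both tricks (the sample payload is made up):

import json

data = '{"title": "\\u84dd\\u9cb8TMT"}'                 # \uXXXX escapes in a plain byte string
print(data.decode("unicode-escape").encode('utf-8'))    # -> {"title": "蓝鲸TMT"}
print(json.loads(data)['title'])                        # json.loads decodes the escapes itself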
Iterating over the nodes of a pyquery object, then operating on each node:
def index_page(self, response):
    for content in response.doc('article.post div.content').items():
        data = {
            'url': response.save['url'],
            'source_channel': response.save['source_channel'],
            'source_name': response.save['source_name'],
            'pub_time': content('div.data span.u-time').text(),
        }
        print(data)
        self.crawl(content('h2>a').attr.href, callback=self.detail_page, save=data)
By default pyspider deduplicates tasks and will not re-request the same URL. To loop POST requests against one address you must force re-crawling: pass an itag parameter and make each URL unique, e.g. by appending a fresh suffix to it.
def index_page(self, response):
    detail_url = 'http://news.cqcoal.com/manage/newsaction.do?method:getNewsAddonarticle'
    dict_list = response.json['rows']
    for aid in dict_list:
        print(aid['id'])
        data = {
            'source_channel': response.save['source_channel'],
            'source_name': response.save['source_name'],
            'source_typeide': response.save['typeid'],
            'pub_time': aid['pubdate'],
            'title': aid['title'],
            'editor': aid['fshuser'],
            'source': aid['source'],
            'content': aid['description'],
        }
        # A unique URL suffix plus a fresh itag defeats the URL-based deduplication.
        timestampStr = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        fakeItagForceRecrawl = "%s" % timestampStr
        url = detail_url + "#" + timestampStr
        self.crawl(url, itag=fakeItagForceRecrawl, data={'id': aid['id']}, headers=headers, callback=self.detail_page, save=data, method='POST')
@config(age=1)
def index_page(self, response):
    source_channel = response.save['source_channel']
    itemList = response.doc('div.trends').items()
    for each in itemList:
        param = {}
        href = each('h3 a').attr.href
        title = each('a').text()
        print('title:' + title)
        pub_time = each('span').text()
        print('pub_time:' + pub_time)
        param["title"] = title
        param["pub_time"] = pub_time
        param.update(response.save)
        if href:
            self.crawl(href, callback=self.detail_page, save=param, headers=headers)

@config(priority=1, age=10 * 24 * 60 * 60)
def detail_page(self, response):
    result = copy.deepcopy(result_template)
    result.update(response.save)
    result["url"] = response.url
    source_channel = response.save['source_channel']
    result["source_name"] = source_name
    result["fetch_time"] = get_now_time()
    # The selectors below must be adapted to the target page's markup.
    html = response.doc("div#pageTxt").html()
    if html:
        result["html"] = html
        result["content"] = get_content_from_html(html)
    result["title"] = response.save['title']
    pub_time = ""
    if 'pub_time' in response.save:
        pub_time = response.save['pub_time']
    else:
        temp = response.doc("div.scd-title em").text()
        if temp.find("今天") != -1:
            # "今天HH:MM" -> "M-D HH:MM" using today's date
            temp = "%s%s%s%s%s" % (str(datetime.datetime.now().month), "-", str(datetime.datetime.now().day), " ", temp.replace("今天", ""))
        else:
            temp = temp.replace("月", "-").replace("日", "")
        pub_time = "%s%s%s" % (str(datetime.datetime.now().year), "-", temp)
    print("pub_time:" + pub_time)
    source = "蓝鲸TMT网"
    print("source:" + source)
    result["source"] = source
    result["pub_time"] = str_2_timestamp(pub_time)
    self.send_message(self.project_name, result, url=result["url"])
def str_2_timestamp(time_str, fmt="%Y-%m-%d %H:%M:%S"):
    # A more robust variant: choose the format by counting colons instead of string length.
    if not time_str:
        return ""
    elif time_str.find(':') == -1:
        fmt = "%Y-%m-%d"
    elif len(re.findall(r':', time_str)) == 1:
        fmt = "%Y-%m-%d %H:%M"
    elif len(re.findall(r':', time_str)) == 2:
        fmt = "%Y-%m-%d %H:%M:%S"
    return int(time.mktime(time.strptime(time_str, fmt)))
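A quick check of the colon-counting dispatch (the sample strings are made up):

print(str_2_timestamp("2020-02-11"))           # no colons  -> "%Y-%m-%d"
print(str_2_timestamp("2020-02-11 08:56"))     # one colon  -> "%Y-%m-%d %H:%M"
print(str_2_timestamp("2020-02-11 08:56:30"))  # two colons -> "%Y-%m-%d %H:%M:%S"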
