python代码段收集

5 行代码入门 Python 爬虫

import requests
import pandas as pd
from bs4 import BeautifulSoup
from lxml import etree
import time
import pymysql
from sqlalchemy import create_engine
from urllib.parse import urlencode  # 编码 URL 字符串https://www.makcyun.top/web_scraping_withpython18.html

start_time = time.time()  #计算程序运行时间
def get_one_page(i, report_time='2017-12-31', timeout=10):
    """Download one result page of the askci.com A-share stock list.

    Args:
        i: Page number, sent as the ``pageNum`` query parameter.
        report_time: Report date sent as ``reportTime``; another date such
            as ``'2018-6-30'`` selects that quarter's figures.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers 200, otherwise
        ``None`` (non-200 status or any request failure).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36'
    }
    paras = {
        'reportTime': report_time,
        'pageNum': i,
    }
    url = 'http://s.askci.com/stock/a/?' + urlencode(paras)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException:
        # The original caught a bare ``RequestException`` that was never
        # imported, so a network error surfaced as NameError instead.
        print('爬取失败')
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_one_page(html):
    """Extract the ``#myTable04`` table from *html* and return it as a DataFrame.

    The Chinese column headers are renamed in place to English identifiers.
    """
    column_names = {
        '序号': 'serial_number',
        '股票代码': 'stock_code',
        '股票简称': 'stock_abbre',
        '公司名称': 'company_name',
        '省份': 'province',
        '城市': 'city',
        '主营业务收入(201712)': 'main_bussiness_income',
        '净利润(201712)': 'net_profit',
        '员工人数': 'employees',
        '上市日期': 'listing_date',
        '招股书': 'zhaogushu',
        '公司财报': 'financial_report',
        '行业分类': 'industry_classification',
        '产品类型': 'industry_type',
        '主营业务': 'main_business',
    }
    # select() returns a list; the first match is the table tag itself
    table_tag = BeautifulSoup(html, 'lxml').select('#myTable04')[0]
    # read_html returns a list of DataFrames; take the first one
    frames = pd.read_html(table_tag.prettify(), header=0)
    tbl = frames[0]
    tbl.rename(columns=column_names, inplace=True)
    return tbl

def generate_mysql():
    """Create the ``listed_company`` table in the ``wade`` database if missing.

    Connects to the MySQL server on localhost:3306 as ``root`` and runs a
    single ``CREATE TABLE IF NOT EXISTS`` statement. The connection is
    always closed, even when the statement fails.
    """
    conn = pymysql.connect(
        host='localhost',
        user='root',
        password='******',
        port=3306,
        charset='utf8',
        db='wade')

    sql = 'CREATE TABLE IF NOT EXISTS listed_company (serial_number INT(20) NOT NULL,stock_code INT(20) ,stock_abbre VARCHAR(20) ,company_name VARCHAR(20) ,province VARCHAR(20) ,city VARCHAR(20) ,main_bussiness_income VARCHAR(20) ,net_profit VARCHAR(20) ,employees INT(20) ,listing_date DATETIME(0) ,zhaogushu VARCHAR(20) ,financial_report VARCHAR(20) , industry_classification VARCHAR(20) ,industry_type VARCHAR(100) ,main_business VARCHAR(200) ,PRIMARY KEY (serial_number))'
    try:
        # The cursor context manager closes the cursor when the block exits
        with conn.cursor() as cursor:
            cursor.execute(sql)
    finally:
        # The original leaked the connection whenever execute() raised
        conn.close()
	
def write_to_sql(tbl, db = 'wade'):
    """Append the DataFrame *tbl* to table ``listed_company2`` of MySQL database *db*.

    An exception raised by ``to_sql`` is printed and not re-raised.
    """
    # NOTE(review): generate_mysql() creates ``listed_company`` while rows are
    # written to ``listed_company2`` - confirm which table name is intended.
    connection_url = 'mysql+pymysql://root:******@localhost:3306/{0}?charset=utf8'.format(db)
    engine = create_engine(connection_url)
    try:
        # 'append' adds rows to an existing table instead of replacing it
        tbl.to_sql('listed_company2', con=engine, if_exists='append', index=False)
    except Exception as e:
        print(e)

def main(page):
    """Crawl pages ``1 .. page - 1`` and append each parsed table to MySQL.

    Args:
        page: Exclusive upper bound of the page numbers to crawl.
    """
    # The original mixed a 4-space line with tab-indented lines in this body,
    # which Python 3 rejects with TabError.
    generate_mysql()
    for i in range(1, page):
        html = get_one_page(i)
        if html is None:
            # get_one_page returns None for a failed download; skip the page
            # rather than handing None to the parser.
            continue
        tbl = parse_one_page(html)
        write_to_sql(tbl)
		
# Single-process run: crawl the pages one after another, then report the run time
if __name__ == '__main__':	
	main(178)
	# elapsed seconds since start_time was recorded at module level
	endtime = time.time()-start_time
	print('程序运行了%.2f秒' %endtime)
	
# Multi-process variant
from multiprocessing import Pool


def _crawl_one_page(i):
    """Fetch, parse and store page *i*; the unit of work for ``Pool.map``."""
    html = get_one_page(i)
    if html is not None:
        write_to_sql(parse_one_page(html))


if __name__ == '__main__':
    generate_mysql()
    # The original mapped main() over the page numbers, so the worker given k
    # re-crawled pages 1..k-1 instead of fetching only page k.
    with Pool(4) as pool:
        # NOTE(review): the source comment says there are 178 pages, yet
        # range(1, 178) stops at 177 - confirm the intended upper bound.
        pool.map(_crawl_one_page, range(1, 178))
    print('程序运行了%.2f秒' % (time.time() - start_time))
	
import pandas as pd
import csv
for i in range(1, 178):  # crawl every page
    # read_html returns every table found on the page; index 3 is the one kept here
    tb = pd.read_html('http://s.askci.com/stock/a/?reportTime=2017-12-31&pageNum=%s' % (str(i)))[3]
    # Write the column header only with the first page: the original passed
    # header=1 on every append, repeating the header row once per page.
    tb.to_csv(r'1.csv', mode='a', encoding='utf_8_sig', header=(i == 1), index=False)

时间处理

# -*- coding: utf-8 -*-
# author:           inpurer(月小水长)
# pc_type           lenovo
# create_date:      2018/12/3
# file_name:        timetest.py
# description:      月小水长，热血未凉

import time

t0 = time.time()
# description:   seconds elapsed since 1970-01-01 as a float, also called a timestamp
# output sample: 1543799532.602318
print(t0)

t1 = time.localtime(t0)
# description:   convert the timestamp to a struct_time with nine fields. The first six are
#                self-explanatory; tm_wday is the day of the week counted from 0, tm_yday is
#                the day of the year counted from 1, tm_isdst is the daylight-saving flag.
# output sample: time.struct_time(tm_year=2018, tm_mon=12, tm_mday=3, tm_hour=9, tm_min=22, tm_sec=24, tm_wday=0, tm_yday=337, tm_isdst=0)
print(t1)
# tm_yday is already 1-based, so it is the day of the year as-is
# (the original printed t1[-2]+1, which is one day too many)
print(t1.tm_yday)


#下面是对该元组的格式化

#description:   简单可读形式
#output sample: Mon Dec  3 09:31:18 2018
t2 = time.asctime(t1)
print(t2)

#description:   可通过参数设置成各种形式，下面是一种标准形式,各参数见名知意
#output sample: 2018-12-03 09:33:36
t3 = time.strftime("%Y-%m-%d %H:%M:%S", t1)
print(t3)
#%y 两位数的年份表示（00-99）
# %Y 四位数的年份表示（0001-9999）
# %m 月份（01-12）
# %d 月内中的一天（01-31）
# %H 24小时制小时数（00-23）
# %I 12小时制小时数（01-12）
# %M 分钟数（00-59）
# %S 秒（00-61，60 和 61 用于闰秒等特殊情况）
#
# %a 本地简化星期名称
# %A 本地完整星期名称
# %b 本地简化的月份名称
# %B 本地完整的月份名称
# %c 本地相应的日期表示和时间表示
# %j 年内的一天（001-366）
# %p 本地A.M.或P.M.的等价符
# %U 一年中的星期数（00-53）星期天为星期的开始
# %w 星期（0-6），星期天为星期的开始
# %W 一年中的星期数（00-53）星期一为星期的开始
# %x 本地相应的日期表示
# %X 本地相应的时间表示
# %Z 当前时区的名称
# %% %号本身


# 下面是把格式化字符串转成元组
# description:      第一个参数个格式化后的字符串,后一个参数和格式化对应，便于反格式化
# output sample:    time.struct_time(tm_year=2018, tm_mon=12, tm_mday=3, tm_hour=9, tm_min=47, tm_sec=7, tm_wday=0, tm_yday=337, tm_isdst=-1)
t4 = time.strptime(t3,'%Y-%m-%d %H:%M:%S')
print(t4)

# 把元组转成时间戳
#description:   是time.localtime的反函数,不过由于格式化的原因，精度有所下降
#output sample: 1543801627.0
t5 = time.mktime(t4)
print(t5)

# -*- coding: utf-8 -*-
# author:           inpurer(月小水长)
# pc_type           lenovo
# create_date:      2018/12/3
# file_name:        timetest.py
# description:      月小水长，热血未凉

import datetime

#通过datetime.datetime.now()可以获得当前日期时间的一个实例
#这个实例是一个datetime类对象而不是字符串
#虽然直接打印该实例输出的是一个字符串,只是调用datetime实现的__str__方法而已
t0 = datetime.datetime.now()
print(t0)           #print: 2018-12-03 12:55:49.905971
print(type(t0))     #print: <class 'datetime.datetime'>

#然后就可以通过对象名.的方法输出各个时间信息,该信息是一个int类型
print(t0.year)          #print: 2018
print(type(t0.year))    #print: <class 'int'>
print(t0.month)
print(t0.day)
print(t0.hour)
print(t0.minute)
print(t0.second)

import datetime
t0 = datetime.datetime.now()
#注意year/month/day都是int类型，不像java那样可以直接拼接字符串和数字
wanted_time = str(t0.year)+"-"+str(t0.month)+"-"+str(t0.day)
#https://inspurer.github.io/2018/12/03/%E4%B8%80%E6%96%87%E6%90%9E%E5%AE%9Apython%E7%9A%84%E6%97%B6%E9%97%B4%E5%A4%84%E7%90%86/
import time
t0 = time.localtime()
wanted_time = time.strftime("%Y-%m-%d",t0)

error: Microsoft Visual C++ 14.0 is required

源码安装，但是没有 C++ 的编译环境

 
error: Microsoft Visual C++ 14.0 is required. Get it with "Microsoft Visual
C++ Build Tools": http://landinghub.visualstudio.com/visual-cpp-build-tools
如果是 Python 2.7 可以安装 Microsoft Visual C++ Compiler for Python 2.7 https://www.microsoft.com/en-us/download/details.aspx?id=44266
如果是 Python 3 可以安装 Visual C++ 2015 Build Tools http://landinghub.visualstudio.com/visual-cpp-build-tools
或者使用下载编译好的 exe 文件
或者使用 whl 格式的包

逗号引发的悲剧

>>> a = [
...     'foo'
...     'bar',
...     'tree'
... ]
>>>
>>> b = 'foo' 'bar'
>>>
>>> print a
['foobar', 'tree']
>>> print b
foobar
>>>
也就是说 'foo' + 'bar' 等价于 'foo' 'bar'

再来看另外一个例子，注意第二行后面的逗号
 
>>> a = {'foo': 'bar'}
>>> b = a.get('foo'),
>>> c = a.get('foo')
>>> print(b)
('bar',)
>>> print(c)
bar
>>>
本来 b 应该是一个字符串，结果硬是变成了元组。https://www.restran.net/2015/11/07/python-comma-issue/

Python获取Bing图片做壁纸

http://jeffyang.top/Python/python%E8%8E%B7%E5%8F%96Bing%E5%9B%BE%E7%89%87%E5%81%9A%E5%A3%81%E7%BA%B8/
def get_url(day=0):
    """Return the Bing HPImageArchive JSON URL with its ``idx`` parameter set to *day*."""
    prefix = "https://www.bing.com/HPImageArchive.aspx?format=js&idx="
    suffix = "&n=1&nc=1509675905008&pid=hp&video=1"
    return "".join((prefix, str(day), suffix))
def get_img(url, path="D://wallpaper/"):
    """Download the image described by a Bing HPImageArchive JSON URL into *path*.

    The JSON at *url* is fetched, the first entry of its ``images`` list is
    read, and the picture is saved as ``<enddate>.jpg`` inside *path*
    (created if it does not exist).
    """
    # source: https://github.com/JianFengY/BingSpider
    # exist_ok replaces the original exists()-then-makedirs() pair, which
    # could fail if the directory appeared between the two calls.
    os.makedirs(path, exist_ok=True)
    content = requests.get(url).json()
    image = content['images'][0]
    src = "https://www.bing.com" + image['url']
    # os.path.join also works when *path* has no trailing separator
    urlretrieve(src, os.path.join(path, image['enddate'] + '.jpg'))
def set_wallpaper_from_bmp(bmp_path):
    """Set the BMP file at *bmp_path* as the Windows desktop wallpaper.

    Writes ``WallpaperStyle`` = "2" and ``TileWallpaper`` = "0" under
    ``HKEY_CURRENT_USER\\Control Panel\\Desktop`` and then asks the system to
    switch wallpaper via ``SystemParametersInfo``.
    """
    # NOTE(review): "2"/"0" presumably mean "stretch, do not tile" - confirm
    # against the Windows documentation.
    reg_key = win32api.RegOpenKeyEx(win32con.HKEY_CURRENT_USER, "Control Panel\\Desktop", 0, win32con.KEY_SET_VALUE)
    try:
        win32api.RegSetValueEx(reg_key, "WallpaperStyle", 0, win32con.REG_SZ, "2")
        win32api.RegSetValueEx(reg_key, "TileWallpaper", 0, win32con.REG_SZ, "0")
    finally:
        # The original never released the registry handle
        win32api.RegCloseKey(reg_key)
    win32gui.SystemParametersInfo(win32con.SPI_SETDESKWALLPAPER, bmp_path, win32con.SPIF_SENDWININICHANGE)
def set_wallpaper(img_path):
    """Convert the image at *img_path* to BMP and install it as the wallpaper.

    The BMP copy is written as ``wallpaper.bmp`` next to the source image.

    Returns:
        True when the image exists and was handed to
        ``set_wallpaper_from_bmp``; False when *img_path* does not exist.
    """
    if not os.path.exists(img_path):
        return False
    new_bmp_path = os.path.join(os.path.dirname(img_path), 'wallpaper.bmp')
    Image.open(img_path).save(new_bmp_path, "BMP")
    set_wallpaper_from_bmp(new_bmp_path)
    return True

自动化测试工具from selenium import webdriver

           from selenium.webdriver.common.by import By
           from selenium.webdriver.common.keys import Keys
           from selenium.webdriver.support import expected_conditions as EC
           from selenium.webdriver.support.wait import WebDriverWait
           
           browser = webdriver.Chrome()
           try:
               browser.get('https://www.baidu.com')
               input = browser.find_element_by_id('kw')
               input.send_keys('Python')
               input.send_keys(Keys.ENTER)
               wait = WebDriverWait(browser, 10)
               wait.until(EC.presence_of_element_located((By.ID, 'content_left')))
               print(browser.current_url)
               print(browser.get_cookies())
               print(browser.page_source)
           finally:
               browser.close()

browser = webdriver.Chrome()
browser.get('https://www.zhihu.com/explore')
browser.execute_script('window.scrollTo(0, document.body.scrollHeight)')
browser.execute_script('alert("To Bottom")')

browser = webdriver.Chrome()
browser.implicitly_wait(10)
browser.get('https://www.zhihu.com/explore')
input = browser.find_element_by_class_name('zu-top-add-question')
print(input)

try:
    browser.get('https://www.baidu.com')
except TimeoutException:
    print('Time Out')
try:
    browser.find_element_by_id('hello')
except NoSuchElementException:
    print('No Element')
finally:
    browser.close()
    
    http://jeffyang.top/Python/%E7%88%AC%E8%99%AB/Python%E7%88%AC%E8%99%AB%E5%B8%B8%E7%94%A8%E5%BA%93selenium%E8%AF%A6%E8%A7%A3/

取交集，并集和差集

a = ['a','b','c']
b = ['b','c','d']

交集:

print list(set(a).intersection(set(b)))

#或者
isec = [val for val in a if val in b]
print isec

并集
print list(set(a).union(set(b)))
差集
print list(set(b).difference(set(a))) # b-a
a = [[1,2],[3,4],[1,4]]
b = [x for j in a for x in j]
print b
[1, 2, 3, 4, 1, 4]

list分割成固定长度子list

def splite_list(splist, s):
    """Split a sequence into consecutive slices of at most *s* items.

    Args:
        splist: Any sliceable sequence with a length (list, tuple, str, range).
        s: Maximum size of each chunk; must be a positive integer.

    Returns:
        A list of slices of *splist*; the last one may be shorter than *s*.
        An empty sequence yields an empty list.

    Raises:
        ValueError: If *s* is not positive.
    """
    if s <= 0:
        raise ValueError("s must be a positive integer")
    # Step straight to each chunk start instead of testing every index with %
    return [splist[i:i + s] for i in range(0, len(splist), s)]

#test
list1 = range(10)
splite_list(list1,2)
[[0, 1], [2, 3], [4, 5], [6, 7], [8, 9]]

生成英文字母表

小写字母表
list(map(chr,list(range(97, 123))))

大写字母
list(map(chr,list(range(65, 91))))

大写字母表
import string
string.ascii_uppercase

小写字母表
import string
string.ascii_lowercase

字典排序

按键排序
dic = {'a':11 , 'b':5 , 'c': 7}

# 升序排序
sorted(dic.keys())

#　降序排序
sorted(dic.keys(), reverse=True)

按值排序
dic = {'a':11 , 'b':5 , 'c': 7}

# 升序
sorted(dic.items(), key = lambda x:x[1])

# 降序
sorted(dic.items(), key = lambda x:x[1],reverse =True)

微信公众号或网页自动导出

https://juejin.im/post/5b4cc601f265da0f5a2545a9
https://github.com/MartinHan01/webpage2img
pip install pillow
init_filelist()
    #首先初始化webdirver
    driver = webdriver.Chrome()
    #设置输出路径
    dir = './result'
    
    for item in filelist:
        try:
            #获取图片路径，标题，以及输出路径
            #自动滚动，并截图保存
            pic_path,title = save_url(driver, item, dir)
            #开始合并我们刚刚截的所有图
            package_picture(pic_path, os.path.abspath(dir), title)
        except Exception as e :
            print(e)
python crop.py

批量压缩图片

pip install --upgrade tinify

import tinify
import os

tinify.key = '此处填入你的key'
path = "xxx" # 图片存放的路径

for dirpath, dirs, files in os.walk(path):
    for file in files:
        imgpath = os.path.join(dirpath, file)
        print("compressing ..."+ imgpath)
        tinify.from_file(imgpath).to_file(imgpath)

重试

import random
from tenacity import retry
@retry
def do_something_unreliable():
    """Succeed only when ``random.randint(0, 10)`` is 0 or 1; otherwise raise IOError.

    Decorated with tenacity's ``@retry`` so that a failed call is attempted again.
    """
    if random.randint(0, 10) <= 1:
        return "Awesome sauce!"
    raise IOError("Broken sauce, everything is hosed!!!111one")
print(do_something_unreliable())

from tenacity import *
@retry(stop=(stop_after_delay(10) | stop_after_attempt(5)), wait=wait_fixed(2))
def stop_after_10_s_or_5_retries():
    """Always fail, to demonstrate the retry policy configured in the decorator.

    The decorator combines ``stop_after_delay(10)`` and
    ``stop_after_attempt(5)`` with ``|`` as the stop condition and uses
    ``wait_fixed(2)`` as the wait between attempts.
    """
    print("Stopping after 10 seconds or 5 retries")
    raise Exception
    
 超过 10 秒或已尝试 5 次后停止重试，每次重试前等待 2 秒
   def func():
       """Placeholder for the operation that is being retried."""
       pass
   # Run func() 100 times; each run is repeated until it finishes without
   # raising SomeSpecificException.
   for _ in range(0,100):
       while True:
           try:
               func()
           except SomeSpecificException:
               # failed: go round the while loop and call func() again
               continue
           # no exception: leave the retry loop
           break
 def verify_url(url):
     """Return True if a GET of *url* completes, False on a connect timeout.

     The request uses a 10-second timeout; only
     ``requests.exceptions.ConnectTimeout`` is handled here.
     """
     import requests
     try:
         requests.get(url, timeout=10)
     except requests.exceptions.ConnectTimeout:
         return False
     else:
         return True
 def main():
     """Call ``verify_url('')`` up to five times, stopping at the first success.

     A KeyError raised by an attempt counts as a failed attempt.
     """
     attempts_left = 5
     while attempts_left > 0:
         attempts_left -= 1
         try:
             succeeded = verify_url('')
         except KeyError:
             continue
         if succeeded:
             return
 if __name__ == '__main__':
     main()          
    https://zhangslob.github.io/2019/01/14/Python%E9%87%8D%E8%AF%95%E7%9A%84%E5%A4%9A%E9%87%8D%E6%96%B9%E6%B3%95/

登录GitHub

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
import requests
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Host': 'github.com',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
}
s = requests.session()
s.headers.update(headers)
def get_token():
    """Fetch https://github.com/login with the shared session and return its CSRF token.

    The token is the first ``value`` captured next to
    ``name="authenticity_token"`` in the page text; IndexError is raised
    when the page contains no such field.
    """
    login_page = s.get('https://github.com/login')
    matches = re.findall('name=\"authenticity_token\" value=\"(.*?)\"', login_page.text)
    return matches[0]
def login(authenticity_token, account, password):
    """POST the sign-in form to https://github.com/session and print the response.

    Args:
        authenticity_token: Token previously read from the login page.
        account: Value sent as the ``login`` form field.
        password: Value sent as the ``password`` form field.
    """
    form_fields = dict(
        commit='Sign in',
        utf8='\u2713',
        authenticity_token=authenticity_token,
        login=account,
        password=password,
    )
    response = s.post('https://github.com/session', data=form_fields)
    print(response)
    # do whatever you want
    # do whatever you want
if __name__ == '__main__':
    account, password = 'account', 'password'
    authenticity_token = get_token()
    login(authenticity_token, account, password)

多线程和多进程

import concurrent.futures
import urllib.request
URLS = ['http://www.foxnews.com/',
        'http://www.cnn.com/',
        'http://europe.wsj.com/',
        'http://www.bbc.co.uk/',
        'http://some-made-up-domain.com/']
# Retrieve a single page and report the URL and contents
def load_url(url, timeout):
    """Open *url* with the given *timeout* and return the whole body as bytes."""
    conn = urllib.request.urlopen(url, timeout=timeout)
    try:
        return conn.read()
    finally:
        # same cleanup the ``with`` statement performed
        conn.close()
# We can use a with statement to ensure threads are cleaned up promptly
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor:
    # Start the load operations and mark each future with its URL
    future_to_url = {executor.submit(load_url, url, 60): url for url in URLS}
    for future in concurrent.futures.as_completed(future_to_url):
        url = future_to_url[future]
        try:
            data = future.result()
        except Exception as exc:
            print('%r generated an exception: %s' % (url, exc))
        else:
            print('%r page is %d bytes' % (url, len(data)))
            
 import concurrent.futures
 import math
 PRIMES = [
     112272535095293,
     112582705942171,
     112272535095293,
     115280095190773,
     115797848077099,
     1099726899285419]
 def is_prime(n):
     """Return True if *n* is a prime number, using trial division by odd numbers.

     Numbers below 2 are not prime. The original returned False for 2, True
     for 1, and raised ValueError from ``math.sqrt`` for negative odd input.
     """
     if n < 2:
         return False
     if n == 2:
         return True
     if n % 2 == 0:
         return False
     # Only odd divisors up to floor(sqrt(n)) need to be tried
     sqrt_n = int(math.floor(math.sqrt(n)))
     for i in range(3, sqrt_n + 1, 2):
         if n % i == 0:
             return False
     return True
 def main():
     """Test every number in PRIMES with is_prime in a process pool and print each verdict."""
     with concurrent.futures.ProcessPoolExecutor() as executor:
         verdicts = executor.map(is_prime, PRIMES)
         for number, prime in zip(PRIMES, verdicts):
             print('%d is prime: %s' % (number, prime))
 if __name__ == '__main__':
     main()
     
     
     https://zhangslob.github.io/2018/07/03/%E5%BF%AB%E9%80%9F%E5%86%99%E4%B8%80%E4%B8%AA%E7%88%AC%E8%99%AB/

Pandas 做数据分析

import pandas as pd
import numpy as np

url = ('https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv')
tips = pd.read_csv(url)
output = tips.head()
>>> output
   total_bill   tip     sex smoker  day    time  size
0       16.99  1.01  Female     No  Sun  Dinner     2
1       10.34  1.66    Male     No  Sun  Dinner     3
2       21.01  3.50    Male     No  Sun  Dinner     3
3       23.68  3.31    Male     No  Sun  Dinner     2
4       24.59  3.61  Female     No  Sun  Dinner     4

sql 语句： SELECT total_bill, tip, smoker, time FROM tips LIMIT 5;。

output = tips[['total_bill', 'tip', 'smoker', 'time']].head(5)

https://learnku.com/articles/29825

多线程

线程 (Thread) 是操作系统能够进行运算调度的最小单位。它被包含在进程中，是进程中的实际运作单位。一个进程中可以并发多个线程，每条线程并行执行不同的任务。同一进程中的多个线程共享进程中的全部系统资源。
以下演示使用多线程对一个变量值进行修改，在循环的次数不多时修改后变量的值是符合预期的，当增加循环次数后，变量最终的值并不符合预期。由此可见：线程之间资源是存在竞争的，修改同一份资源必须加互斥锁，同时需要避免死锁。

# coding=utf-8
import threading

# 定义一个字段。多线程执行+1操作
balance = 0

def worker1():
    """Add 1 to the module-level ``balance`` 1000 times without any lock, then print it."""
    global balance
    for _ in range(1000):
        balance = balance + 1
    print('线程1执行完成，balance=' + str(balance))

def worker2():
    """Add 1 to the module-level ``balance`` 1000 times without any lock, then print it."""
    global balance
    for _ in range(1000):
        balance = balance + 1
    print('线程2执行完成，balance=' + str(balance))

def main():
    """Start worker1 and worker2 on two threads; this function does not join them."""
    # Build the thread objects
    t1 = threading.Thread(target=worker1)
    t2 = threading.Thread(target=worker2)
    # Start both threads
    t1.start()
    t2.start()

    # The bare string below is an expression statement that records sample output
    """
    循环次数为1000时，程序输出：
        线程1执行完成，balance=1000
        线程2执行完成，balance=2000
    循环次数为1000000时，程序输出：
        线程1执行完成，balance=1180919
        线程2执行完成，balance=1179703
    """    

if __name__ == '__main__':
    main()
要想解决以上的问题，需要使用线程的锁对象，只需要对 worker1 和 worker2 方法进行修改。

# 创建一个互斥锁，默认是未锁定状态
mutex = threading.Lock()

def worker1():
    """Add 1 to ``balance`` 1000000 times, holding ``mutex`` for each increment, then print it."""
    global balance
    for _ in range(1000000):
        # the with-statement acquires the lock and releases it after the increment
        with mutex:
            balance += 1
    print('线程1执行完成，balance=' + str(balance))

def worker2():
    """Add 1 to ``balance`` 1000000 times, holding ``mutex`` for each increment, then print it."""
    global balance
    for _ in range(1000000):
        # the with-statement acquires the lock and releases it after the increment
        with mutex:
            balance += 1
    print('线程2执行完成，balance=' + str(balance))

"""
加了互斥锁之后的输出：
    线程1执行完成，balance=1941343
    线程2执行完成，balance=2000000
"""
特点：

线程执行的顺序是不确定的
主线程【进程】会等待所有子线程结束后才会退出，主线程【进程】结束那么子线程必然结束
线程间共享资源
修改资源必要时需要加锁，同时避免死锁
占用的资源比进程少
线程并不是越多越快
由于 GIL 的原因，多线程并不是真正的并行，只是交替执行
https://learnku.com/articles/29367

尾递归

# 例子代码
def tail_recursion(n, total=0):
    """Return ``total + n + (n-1) + ... + 1`` using an accumulator-style recursive call."""
    return total if n == 0 else tail_recursion(n - 1, total + n)
# 执行结果：
tail_recursion(5)
tail_recursion(4, 5)
tail_recursion(3, 9)
tail_recursion(2, 12)
tail_recursion(1, 14)
tail_recursion(0, 15)
5+4+3+2+1=15

获取公众号全部文章

https://mp.weixin.qq.com/s/nkW2sYLcdsNTYTkk-4BeLA
import requests
import json
import time
from pymongo import MongoClient

url = 'http://mp.weixin.qq.com/mp/xxx'  # 公众号不让添加主页链接，xxx表示profile_ext
#url='https://mp.weixin.qq.com/mp/profile_ext?action=getmsg&__biz=MzIyMjg2ODExMA==&f=json&offset=21&count=10&is_ok=1&scene=124&uin=NjQ3OTQwMTAy&key=a90c16d3bbfeedd04adeeda7bfc81049f486e81712f95a347e33fccfb9fe00841ec6a4d0984ce32f72fe5e8c479fd13c6680b5496cda322ab1bb2b81417a10ae277a861ad580e77cdf78edbf86212c08&pass_ticket=2vonvdf3N4L67te2BCa4ZqvIs1ed2MoeBqoznvfNSL%2BeKqF4YgHUvNEWLNczZovz&wxtoken=&appmsg_token=1015_jLHC7BDStvidMqo9YO55XLerjoP9z6UM70Q5vw~~&x5=0&f=json'
# Mongo配置
conn = MongoClient('127.0.0.1', 27017)
db = conn.wx  #连接wx数据库，没有则自动创建
mongo_wx = db.article  #使用article集合，没有则自动创建

def get_wx_article(biz, uin, key, index=0, count=10):
    """Fetch one page of an official account's message list and store its articles.

    Each article's title, URL, cover image and formatted publish time is
    inserted into the module-level ``mongo_wx`` collection.

    Args:
        biz: Value sent as the ``__biz`` query parameter.
        uin: Value sent as the ``uin`` query parameter.
        key: Value sent as the ``key`` query parameter.
        index: Zero-based page index; the request offset is ``(index + 1) * count``.
        count: Number of messages requested per page.

    Returns:
        True when the response reports ``can_msg_continue == 1`` (more pages
        remain), otherwise False - also when ``errmsg`` is not ``'ok'``.
    """
    # NOTE(review): with index=0 the offset is already ``count``, so the first
    # ``count`` messages are never requested - confirm this is intended.
    offset = (index + 1) * count
    params = {
        '__biz': biz,
        'uin': uin,
        'key': key,
        'offset': offset,
        'count': count,
        'action': 'getmsg',
        'f': 'json'
    }

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
    }

    response = requests.get(url=url, params=params, headers=headers)
    resp_json = response.json()
    if resp_json.get('errmsg') != 'ok':
        print('获取文章异常...')
        return False

    # Whether further pages exist; decides the return value
    can_msg_continue = resp_json['can_msg_continue']
    # ``general_msg_list`` is itself a JSON document encoded as a string
    general_msg_list = json.loads(resp_json['general_msg_list'])
    messages = general_msg_list.get('list') or []
    print(messages, "**************")
    for message in messages:
        app_msg_ext_info = message.get('app_msg_ext_info')
        if app_msg_ext_info is None:
            # presumably a non-article message; the original raised KeyError here
            continue
        publish_time = time.strftime(
            "%Y-%m-%d %H:%M:%S",
            time.localtime(message['comm_msg_info']['datetime']))
        # Collection.insert() was removed in PyMongo 4; insert_one is its replacement
        mongo_wx.insert_one({
            'title': app_msg_ext_info['title'],
            'content_url': app_msg_ext_info['content_url'],
            'cover': app_msg_ext_info['cover'],
            'datetime': publish_time
        })
    return can_msg_continue == 1


if __name__ == '__main__':
    biz = 'Mzg4MTA2Nzg0NA=='
    uin = 'NDIyMTI5NDM1'
    key = '20a680e825f03f1e7f38f326772e54e7dc0fd02ffba17e92730ba3f0a0329c5ed310b0bd55b3c0b1f122e5896c6261df2eaea4036ab5a5d32dbdbcb0a638f5f3605cf1821decf486bb6eb4d92d36c620'
    index = 0
    while 1:
        print(f'开始抓取公众号第{index + 1} 页文章.')
        flag = get_wx_article(biz, uin, key, index=index)
        # 防止和谐，暂停8秒
        time.sleep(8)
        index += 1
        if not flag:
            print('公众号文章已全部抓取完毕，退出程序.')
            break

        print(f'..........准备抓取公众号第{index + 1} 页文章.')

PHP 与 Python 代码对比

# -*- coding: utf-8 -*-
n = 0
# https://learnku.com/articles/30958
while n < 3:
    #累计次数,用于循环条件
    n = n + 1
    #定义账号和密码
    uname = 'tangqingsong'
    pwd = '123123'
    #接收参数
    username = input('请输入用户名:')
    password = input('请输入密码:')

    #判断用户输入的账号和密码是否正确，正确将提示成功，并且退出循环体
    if uname == username and pwd == password:
        print ('恭喜你，登陆成功~')
        break
    #三次机会用完的时候,提示错误次数，并告知即将退出
    elif n == 3:
        print('已错误', n, '次，即将退出...')
    #如果在三次以内,提示还剩下几次机会
    else:
        print('抱歉，账号或密码不正确，你还有', 3 - n, '次机会')
        
 $n = 0;
 
 while ($n < 3) {
     #累计次数,用于循环条件
     $n = $n + 1;
     #定义账号和密码
     $uname = 'tangqingsong';
     $pwd = '123123';
     #接收参数
     fwrite(STDOUT, '请输入用户名：');
     $username = trim(fgets(STDIN));
     fwrite(STDOUT, '请输入密码：');
     $password = trim(fgets(STDIN));
 
     #判断用户输入的账号和密码是否正确，正确将提示成功，并且退出循环体
     if ($uname == $username and $pwd == $password) {
         print_r('恭喜你，登陆成功~');
         break;
         #三次机会用完的时候,提示错误次数，并告知即将退出
     } elseif ($n == 3) {
         print_r("已错误{$n}次，即将退出...");
     } else {
         #如果在三次以内,提示还剩下几次机会
         $j = 3 - $n;
         print_r("抱歉，账号或密码不正确，你还有{$j}次机会");
     }
 }

python2与3的编码

Python2有两种表示字符序列的类型，分别叫做str和Unicode，str实例包含原始的8位值；而Unicode的实例，则包含Unicode字符。

str格式本质含义是“某种编码格式”，绝大多数情况下，被引号框起来的字符串，就是str，这时的字符串编码类型，其实就是你Python文件的编码类型，比如在Windows里，默认用的是GBK编码。
Unicode格式的含义就是“用unicode编码的字符串”。Python在进入2.0版后正式定义了Unicode字符串这个奇怪的特性，目的就是为了处理太多种语言编码的文本。从那时开始，Python语言中的字符串类型就分为两种：一种是传统的Python字符串（各种花样编码），另一种则是新出现的Unicode
Python3也有两种表示字符序列的类型：bytes和str。前者的实例包含原始的8位值，后者的实例包含Unicode字符,可以说python3的str，就是python2的Unicode

str格式的定义变更为”Unicode类型的字符串“，也就是说在默认情况下，被引号框起来的字符串，是使用Unicode编码的。
而“不是Unicode的某种编码格式”，比如UTF-8、GBK，这些编码方式被定义为了bytes，这里的bytes和py2中的str有很多相似的地方
我们需要编写两个辅助（helper）函数，以便在这两种情况之间转换，使得转换后的输入数据能够符合开发者的预期

#在Python3中，我们需要编写接受str或bytes，并总是返回str的方法：
def to_str(bytes_or_str):
    """Return *bytes_or_str* as ``str`` (Python 3): bytes are decoded as UTF-8, anything else is returned unchanged."""
    if not isinstance(bytes_or_str, bytes):
        return bytes_or_str
    return bytes_or_str.decode('utf-8')
  
#另外，还需要编写接受str或bytes，并总是返回bytes的方法：
def to_bytes(bytes_or_str):
    """Return *bytes_or_str* as ``bytes`` (Python 3).

    A ``str`` is encoded as UTF-8; any other value is returned unchanged.
    The original had an unterminated ``'utf-8`` literal and did not compile.
    """
    if isinstance(bytes_or_str, str):
        value = bytes_or_str.encode('utf-8')
    else:
        value = bytes_or_str
    return value  # Instance of bytes
  
#在Python2中，需要编写接受str或unicode，并总是返回unicode的方法：
#python2
def to_unicode(unicode_or_str):
    """Python 2 helper: decode a ``str`` as UTF-8, return any other value unchanged."""
    if not isinstance(unicode_or_str, str):
        return unicode_or_str
    return unicode_or_str.decode('utf-8')  # Instance of unicode
  
#另外，还需要编写接受str或unicode，并总是返回str的方法：
#Python2
def to_str(unicode_or_str):
    """Python 2 helper: encode a ``unicode`` value as UTF-8, return any other value unchanged.

    The original ended with ``reutrn vlaue``, a syntax error.
    """
    if isinstance(unicode_or_str, unicode):
        value = unicode_or_str.encode('utf-8')
    else:
        value = unicode_or_str
    return value  # Instance of str
  https://xin053.github.io/2016/10/30/Python%E5%AD%A6%E4%B9%A0%E9%87%8D%E7%82%B9%E6%91%98%E8%AE%B0/
  
  str包含一个encode方法，使用特定编码将该字符串转换为一个bytes，这称之为编码。
  bytes类包含了一个decode方法，也接受一个编码作为可选参数（默认为utf-8），并返回一个str，
  这称之为解码。
 
  s='π排球の'
  b1=s.encode('utf-8')
  b2=s.encode()
  print(b1)
  print(b2)
  b'\xcf\x80\xe6\x8e\x92\xe7\x90\x83\xe3\x81\xae'
  b'\xcf\x80\xe6\x8e\x92\xe7\x90\x83\xe3\x81\xae'
  
  import sys
  print(sys.platform)
  print(sys.getdefaultencoding())
  win32
  utf-8
  可以看出我这个平台默认选择的就是utf-8编码方式。
  b=b'\xe6\x8e\x92\xe7\x90\x83'
  s1=b.decode(encoding='utf-8')
  s2=b.decode()
  s3=b.decode(encoding='latin-1')
  print(s1)
  print(s2)
  print(s3)
  排球
  排球
  æç（latin-1 解码得到的乱码，其中还夹杂着不可见的控制字符）
  
  >>> '请'.encode('unicode-escape')
  b'\\u8bf7'
  >>> b'\u8bf7'.decode('unicode-escape')
  '请'
  
  s='apple'
  b=b'apple'
  print(b)
  print(type(b))
  print(s)
  print(type(s))
  b'apple'
  <class 'bytes'>
  apple
  <class 'str'>
  再近距离的看看bytes类型字节字符串，本质上它就是一串单字节16进制数 https://www.zhihu.com/question/35584979
  b=b'apple'
  print(b[0])
  print(b[1:])
  print(list(b))
  97
  b'pple'
  [97, 112, 112, 108, 101]
  
  s='AÄBèC'
  with open('utf-8data','w', encoding='utf-8') as f:
      f.write(s)
  with open('utf-8data','r', encoding='utf-8') as f:
      u_str=f.read()
  print(u_str)
  AÄBèC
  with open('utf-8data','rb') as f:
      byte_str=f.read()
  print(byte_str)
  print(byte_str.decode(encoding='utf-8'))
  b'A\xc3\x84B\xc3\xa8C'
  AÄBèC

Win10 下 Python2 与 Python3 兼容问题

新旧2个文件，加入环境变量
python2.bat

@echo off

rename "C:\Program Files\Python37\python.exe" python.exe.disabled
rename "C:\Program Files\Python37\Scripts\pip.exe" pip.exe.disabled
python3.bat

@echo off

rename "C:\Program Files\Python37\python.exe.disabled" python.exe
rename "C:\Program Files\Python37\Scripts\pip.exe.disabled" pip.exe

https://learnku.com/articles/31141

剪刀、石头、布

#剪刀、石头、布
import random
guess_list = ["石头", "剪刀", "布"]
win_combination = [["布", "石头"], ["石头", "剪刀"], ["剪刀", "布"]]
while True:
    people = input('请输入：石头,剪刀,布\n').strip()
    computer = random.choice(guess_list)
    print('电脑出拳：'+computer)
    if people not in guess_list:
        print('咦~~弄啥类你！~~~')
        continue
    if computer == people:
        print ('平手，再玩一次！')
    elif [computer, people] in win_combination:
        print ('电脑获胜！继续吧~~~')
    else:
        print ('你获胜！')
        break
    print('---------------------------------')

print('Press any key to exit');
input();                                        #防止控制台输出秒退

md5和sha1加密


import hashlib
 
data = 'This a md5 test!'
# hashlib accepts only bytes in Python 3; passing the str raised TypeError
hash_md5 = hashlib.md5(data.encode('utf-8'))

hash_md5.hexdigest()
MD5不仅仅是上面这个例子这样用来处理字符串，还有更广泛的用途：
加密网站注册用户的密码。 （但去年的各大网站密码泄漏事件确实让人蛋疼……）
网站用户上传图片 / 文件后，计算出MD5值作为文件名。（MD5可以保证唯一性）
key-value数据库中使用MD5值作为key。
比较两个文件是否相同。（大家在下载一些资源的时候，就会发现网站提供了MD5值，就是用来检测文件是否被篡改）
用MD5来检测两个文件是否相同，但想想，如果是两个很大的文件，担心内存不够用，这时怎么办？
这就要使用 update 方法了。代码如下：
import hashlib

def get_file_md5(f):
    """Return the hex MD5 digest of an open file, read in 10240-byte chunks.

    Reading chunk by chunk keeps memory use flat for large files.
    """
    digest = hashlib.md5()
    chunk = f.read(10240)
    while chunk:
        digest.update(chunk)
        chunk = f.read(10240)
    return digest.hexdigest()


# Hashing needs bytes, so open the file in binary mode on every platform;
# text mode hands str to md5.update(), which raises TypeError on Python 3.
with open(YOUR_FILE, 'rb') as f:
    file_md5 = get_file_md5(f)
(Python 3 中必须使用 'rb' 方式打开文件，因为 update 只接受 bytes；Python 2 下 windows 用户同样要使用 'rb' 方式)
大家可以用下面这段代码验证一下：
import hashlib

x = hashlib.md5()
# update() takes bytes; feeding the data in pieces gives the same digest as one call
x.update(b'hello, ')
x.update(b'python')
x.hexdigest()

hashlib.md5(b'hello, python').hexdigest()
这两次的输出是一样的。
SHA1 也是一样的用法。https://p0sec.net/index.php/archives/33/

concat 组合 dataframe

import pandas as pd
india_weather = pd.DataFrame({
    'city': ['mumbai', 'delhi', 'banglore'],
    'temperature': [32, 34, 30],
    'humidity': [80, 60, 72]
})
us_weather = pd.DataFrame({
    'city': ['newyork', 'chicago', 'orlando'],
    'temperature': [21, 24, 32],
    'humidity': [68, 65, 70]
})
df = pd.concat([india_weather, us_weather])
df = pd.concat([india_weather, us_weather], ignore_index=True)

df = pd.concat([india_weather, us_weather], keys=['india', 'us'])

df.loc['india']
df = pd.concat([temperature_df, windspeed_df], axis=1)
https://learnku.com/articles/26025

数学问题

数学问题：假如一个星球有 100 人，每年人数翻一倍。那么，多少年之后人数才有 100 万人
>>> p=100
>>> y=0
>>> while p<1000000:
...       p*=2
...       y+=1
...
>>> y
14

Python语法速查

http://www.ikeguang.com/2019/03/17/python-sytnax/
a = [1, 2]
b = a
print(id(a) - id(b))  # 地址差为 0，表示实质是同址的
0
b.append(3)
print(a)  # 只改动了 b，但 a 也跟着变动了
[1, 2, 3]
a is b
True
使用切片来重新分配空间：

 
a is a[:]
False
运算两数中只要有一个浮点数，结果就是浮点数；
整数相除，即使能除尽，结果也是浮点数；
Python 内部的机制解决了整数溢出的问题，不用担心。
序列主要包括字符串（str）、列表（list）与元组（tuple）三类。
>>> 'ab'.index('b')
1
>>> 'b' in 'ab'
True
>>> max([1,2,3])
3
s = " I love Python"  # 首位是空格
lst = s.split(' ')
lst1 = '-'.join(lst)
strip() 去掉字符串首尾两端的空格。方法 lstrip()/rstrip() 则只切除首端/尾端的空格。
'I like {} and {}'.format('Python', 'you')
'I like Python and you'
'{0} + {2} = {1}'.format (10, 20, 'Python ')  # 按顺序引用
'10 + Python  = 20'
'{0} * {1} = {0}'.format (10, 'Python ')  # 编号反复引用
'10 * Python  = 10'

bing搜索

import requests, re, time, webbrowser, codecs
print('==========搜索引擎==========')
time.sleep(0.7)
headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 
 'Accept-Encoding':'gzip, deflate, sdch', 
 'Accept-Language':'zh-CN,zh;q=0.8', 
 'Cache-Control':'max-age=0', 
 'Connection':'keep-alive', 
 'Cookie':'SRCHD=AF=NOFORM; SRCHUID=V=2&GUID=E4CB65F3BD7F4EC7922E3642567A39EC&dmnchg=1; _EDGE_V=1; MUID=24CC781F18B266D70F9C758D199C670F; MUIDB=24CC781F18B266D70F9C758D199C670F; SRCHUSR=DOB=20190707&T=1562487393000; SNRHOP=I=&TS=; _EDGE_S=mkt=zh-cn&SID=2C85ED242A1D66051D4FE0B62B33673B; _SS=SID=2C85ED242A1D66051D4FE0B62B33673B&HV=1562490664; SRCHHPGUSR=CW=1089&CH=1742&DPR=1&UTC=480&WTS=63698084193&PR=3', 
 'DNT':'1', 
 'Host':'cn.bing.com', 
 'Upgrade-Insecure-Requests':'1', 
 'User-Agent':'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1'}
try:
    try:
        while True:
            web_ids = {}
            test = []
            search = input('输入搜索的内容：')
            page = [11, 21, 31, 41, 51]

            def catch():
                """Search Bing for ``search``, list the hits, and open the ones the user picks.

                Reads ``search``, ``page``, ``web_ids`` and ``test`` from the
                enclosing loop and ``headers`` from module level. Every hit is
                printed with a running number; the user then enters
                comma-separated numbers of the results to open in the browser.
                """
                global webs
                # local import: the query must be percent-encoded before it goes into the URL
                from urllib.parse import quote_plus

                heading_pattern = '<h2>(.*?)</h2>'
                link_pattern = 'href="(.*?)"'
                text_pattern = '>(.*?)</a>'
                id_times = 0
                for t in page:
                    url = 'https://cn.bing.com/search?q=' + quote_plus(search) + '&qs=n&sp=-1&pq&sc=0-5&sk=&cvid=275512403280414F9363B7CDC7368CBD&first=' + str(t) + '&FORM=PERE'
                    text = requests.get(url, headers=headers).text
                    for j in re.findall(heading_pattern, text):
                        title = re.findall(text_pattern, j)
                        webs = re.findall(link_pattern, j)
                        if not title or not webs:
                            # An <h2> without a link is not a search hit; the
                            # original indexed [0] first and raised IndexError.
                            continue
                        id_times = id_times + 1
                        web_ids[id_times] = webs
                        print(id_times, title[0])
                        print('网址：%s' % webs[0])
                        print()
                        test.append(1)

                if len(test) > 0:
                    print('打开网址（如需打开多个网址，请用英文符号“,”，使用其它键默认不打开任何网址）')
                    open_web = input('网址编号：').split(',')
                    if 'n' not in open_web:
                        for aweb in open_web:
                            aweb = aweb.strip()
                            if aweb.isdigit() and int(aweb) in web_ids:
                                webbrowser.open(web_ids[int(aweb)][0])
                else:
                    # The original attached this branch to the 'n' check, so
                    # it never ran when the search returned nothing.
                    print('没有符合您的搜索结果！')
                print('==============================')

            catch()

    except KeyboardInterrupt:
        exit()

except requests.exceptions.ConnectionError:
    print('抱歉，网络出现了一点问题！')  # https://learnku.com/articles/32422
    time.sleep(3)
    exit()

requests-html

pip install requests-html
from requests_html import HTMLSession
session = HTMLSession()
# GET请求
url = "http://kaoshi.edu.sina.com.cn/college/scorelist?tab=batch&wl=1&local=2&batch=&syear=2013"
r = session.get(url)
r.encoding='utf-8'  # 解决中文乱码问题
print(r.text)
# 获取的网页的内容存储到本地
with open('test.html','wb') as f:
    f.write(r.content)

# POST请求
url = 'https://shuju.wdzj.com/plat-info-target.html'
params = {'wdzjPlatId': 59,'type': 1, 'target1': 1, 'target2': 0}
r = session.post(url, params=params)
print(r.text)

###定制请求头
headers = {'user-agent': 'my-app/0.0.1'}
r = session.get(url, headers=headers)
r = session.get('http://www.w3school.com.cn')
print(r.html.links)

# 输出 (太多，中间省略部分)
{'http://www.w3ctech.com/', '/glossary/index.asp', '/html5/html5_quiz.asp', '/php/index.asp', '/asp/index.asp', '/php/php_ref_date.asp', 'http://wetest.qq.com/?from=links_w3school', '/asp/asp_ref.asp', '/tags/index.asp', '/xmldom/index.asp', '/example/csse_examples.asp', '/w.asp', '/index.html', 'http://weibo.com/w3schoolcomcn', '/ws.asp', '/b.asp', '/cssref/index.asp', '/jquerymobile/index.asp',
...
'/xsl/xsl_languages.asp',
'/example/html_examples.asp'}

# 获取绝对地址
t = r.html.absolute_links
print(t)
{'http://www.w3school.com.cn/media/index.asp',
'http://www.w3school.com.cn/glossary/index.asp',
'http://www.w3school.com.cn/php/php_ref.asp',
'http://www.w3school.com.cn/site/index.asp',
...
'http://www.w3school.com.cn/asp/asp_quiz.asp'}
# 获取 w3school 首页左侧的菜单列表；first=True 表示只返回第一个匹配的元素（这里是“HTML教程”所在的菜单）
menuList = r.html.find('#navsecond > ul', first=True)
print(menuList.text)

# 输出
HTML
HTML5
XHTML
CSS
CSS3
TCP/IP

# 找出所有菜单的标题和链接
menuList = r.html.find('#navsecond > ul')
for menu in menuList:
    print(menu.text)  # 获得标题
    print(menu.absolute_links)  # 获得链接

# 输出
HTML
HTML5
XHTML

from requests_html import HTMLSession
import requests

session = HTMLSession()


# 背景图片地址
url = "http://www.win4000.com/wallpaper_2285_0_10_1.html"
r = session.get(url)

# 新建bg文件夹
if not os.path.exists('bg'):
    os.mkdir('bg')

# 保存图片到bg/目录
def save_image(url, title):
    """Download the image at ``url`` and store it as ``./bg/<title>.jpg``.

    Args:
        url: Address of the image to download.
        title: File name (without extension) used inside the ``bg`` folder.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    # A timeout keeps one stalled download from hanging the whole crawl.
    img_response = requests.get(url, timeout=10)
    # Fail loudly instead of saving an HTML error page under a .jpg name.
    img_response.raise_for_status()
    with open('./bg/'+title+'.jpg', 'wb') as file:
        file.write(img_response.content)

# 查找页面中图片列表，找到链接，
# 点击链接，访问查看大图，并获取大图地址pic-large
items_img = r.html.find('ul.clearfix > li > a')
for img in items_img:
    img_url = img.attrs['href']
    if "/wallpaper_detail" in img_url:
        r = session.get(img_url)          # 解析图片详情
        item_img = r.html.find('img.pic-large', first=True)
        url = item_img.attrs['src']       # 大图图片地址
        title = item_img.attrs['title']   # 图片标题
        print(url+title)
        save_image(url, title)
        
http://www.golang365.com/#/blog/17

输出今天日期

import time

# Get today's date and time
nowdate = time.localtime(time.time())  # current time as a local struct_time (not a timestamp)
today = time.strftime('%Y-%m-%d %H:%M:%S', nowdate)  # format it as a string
print(today)

保存json文件

import io
import json

# Dictionary keys must be quoted strings: the bare names ``id`` and ``name``
# would refer to the builtin ``id`` and to an undefined variable.
sendData = [
  {
    'id': 1,
    'name': '奥特曼'
  },{
    'id': 2,
    'name': '小怪兽'
  }
]

# ensure_ascii=False keeps the Chinese text readable in the output file.
with io.open('data.json', 'w', encoding="utf-8") as file:
    json.dump(sendData, file, ensure_ascii=False, sort_keys=True, indent=2)
print('保存成功')#http://www.golang365.com/#/blog/18

openCV

pip install --upgrade setuptools

pip install numpy Matplotlib

pip install opencv-python
如果多次下载失败，可以从 http://www.lfd.uci.edu/~gohlke/pythonlibs/ 直接下载whl包安装，安装whl包依然使用pip

# Import the OpenCV module
import cv2 as cv
# Read the image; common formats such as bmp, jpg, png and tiff are supported
img = cv.imread(r"E:\python\test.jpg")
# Create a window and show the image
cv.namedWindow("Image")
cv.imshow("Image",img)
cv.waitKey(0)
# Release the window. The module is imported under the alias ``cv``, so the
# name ``cv2`` used here before was undefined.
cv.destroyAllWindows()
http://www.golang365.com/#/blog/19

图片转pdf

pip install reportlab
import sys
from reportlab.pdfgen import canvas

# 生成多页pdf 生成一个3页的pdf文件
def texttopdf():
    """Write ``text.pdf`` with three pages, each holding one line of text."""
    page_texts = (
        "Some text in first page.",
        "Some text in second page.",
        "Some text in third page",
    )
    pdf = canvas.Canvas('text.pdf')
    for line in page_texts:
        pdf.drawString(100, 100, line)
        # One showPage() call per page, as in a hand-written sequence.
        pdf.showPage()
    pdf.save()

texttopdf()
print('转换成功！')

# 单张图片转pdf，图片不失真。比较清晰

import sys
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from PIL import Image

import os

# Convert the image named on the command line if one is given,
# otherwise fall back to wechat.png -> test.pdf.
if len(sys.argv) > 1:
    img = sys.argv[1]
    # splitext() removes only the final extension, so dots elsewhere in the
    # path (e.g. "./pics.v2/a.png") no longer truncate the name.
    filename = os.path.splitext(img)[0]
    f_jpg = filename+'.jpg'
    f_pdf = filename+'.pdf'
    print(f_jpg)

else:
    img = 'wechat.png'
    f_pdf = 'test.pdf'


def imgtopdf():
    """Render the module-level image ``img`` onto a one-page PDF ``f_pdf``.

    The page is sized to the image's pixel dimensions and the image is drawn
    at that same size, starting at the page origin.
    """
    # Only the dimensions are needed here; the context manager closes the
    # file handle that Image.open() would otherwise leave open.
    with Image.open(img) as picture:
        maxw, maxh = picture.size
    c = canvas.Canvas(f_pdf, pagesize=(maxw, maxh))
    c.drawImage(img, 0, 0, maxw, maxh)
    c.showPage()
    c.save()


imgtopdf()
print('转换成功！')
https://github.com/sweida/python-study/tree/master/imgToPdf
http://www.golang365.com/#/blog/22

模拟登录

import requests
session = requests.session()

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}

def login():
    """Log in through the shared ``session`` and post a comment on success.

    Prints the outcome; on failure the ``msg`` field of the reply is printed.
    """
    url = 'http://119.29.27.100/apis/login'
    # Placeholders: replace with real credentials. They are string literals
    # so that the snippet is valid Python (the bare tokens were not).
    data = {
        'username': 'XXX',
        'password': '*******'
    }
    response = session.post(url, data=data, headers=headers)

    responseData = response.json()
    if responseData['status']==1:
        print('登录成功')
        # comment() posts through the same session object used for the login.
        comment()
    else:
        print('登录失败', '失败原因：', responseData['msg'])


def comment():
    """Post a fixed message via the shared ``session`` and print the outcome."""
    payload = {'content': '这条应该是有登录的', 'ykname': ''}
    reply = session.post(
        'http://119.29.27.100/apis/message/add',
        data=payload,
        headers=headers,
    ).json()

    if reply['status'] != 1:
        print('留言失败', '失败原因：', reply['msg'])
        return
    print('留言成功')

login()

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}

# 利用浏览器登录后得到的cookie，
cookie_str = r'_myFavMv=%5B%5D; td_cookie=3034830472; laravel_session=eyJpdiI6Ik4wSjFSUU1wcFo1SndYRFliNWZZeXc9PSIsInZhbHVlIjoiMHJWZzM1WmpGRXp6NWVLYk9OaUdHOVVzcWRNK25lQ21lMFhIcmk4eUxKcEFMSnhwSDBMbTFyM3duUllqT3IycGRIc3V2TGhzWEdWaytWRkpzT3hNelE9PSIsIm1hYyI6ImNiMjRhMGFiYTIxYWJhMjUwZDJlNGI2ODgzY2ZiYzY4ZGY4NzI0MDQ4OGZkN2RiNGIwZGM2M2I2YmExYmY3OGIifQ%3D%3D'

# Turn the cookie string into a dict for later use
cookies = {}
for line in cookie_str.split(';'):
    # Pairs are separated by "; ", so strip the blank left in front of each
    # name; otherwise a cookie would be stored under a key like " td_cookie".
    line = line.strip()
    if '=' not in line:
        continue  # ignore empty or malformed fragments (e.g. a trailing ';')
    key, value = line.split('=', 1)
    cookies[key] = value

def comment():
    """Post a fixed message, sending the module-level ``cookies`` with it."""
    form = {'content': '再试一条cookie请求', 'ykname': ''}
    response = requests.post(
        'http://119.29.27.100/apis/message/add',
        data=form,
        headers=headers,
        cookies=cookies,
    )
    result = response.json()
    succeeded = result['status'] == 1
    if succeeded:
        print('留言成功')
    else:
        print('留言失败', '失败原因：', result['msg'])

comment()
http://www.golang365.com/#/blog/54

猫眼票房

import re
import time
import datetime
import base64

import requests
from requests.exceptions import RequestException
from pyquery import PyQuery as pq
from fontTools.ttLib import TTFont

font = TTFont('font1.woff')
uni_list = font.getGlyphOrder()[2:]
first_match = {
    'uniE893': '0',
    'uniF690': '1',
    'uniF55C': '2',
    'uniF28F': '3',
    'uniF4B1': '4',
    'uniE623': '5',
    'uniF294': '6',
    'uniEEC4': '7',
    'uniE577': '8',
    'uniE77B': '9'
}

def get_one_page(date):
    """Fetch the Maoyan box-office page for ``date``.

    Args:
        date: Date string appended to the ``date`` query parameter.

    Returns:
        The page HTML on HTTP 200, otherwise ``None`` (also on request errors).
    """
    # This snippet's import list has no ``os``; import it where it is used.
    import os

    headers = {
        'User-Agent': os.getenv('User_Agent')
    }
    url = 'https://piaofang.maoyan.com/?ver=normal&date=' + date
    try:
        # Without a timeout a stalled connection would block forever; a
        # timeout error is a RequestException and is handled below.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.text
        else:
            return None
    except RequestException as e:
        print("Requests {}, Error {}.".format(date, e.args))
        return None

def parse_font(html):
    """Map the glyph names of the font embedded in ``html`` to digits.

    The base64 font found in the page is decoded and written to ``tmp.woff``.
    Each of its glyphs is compared with the glyphs of the module-level
    reference ``font``; equal glyphs take the digit stored in ``first_match``.

    Returns:
        dict mapping glyph names of the page's font to digit strings.
    """
    encoded = re.findall(r'base64,(.*?)\)', html, re.S)[0]
    with open('tmp.woff', 'wb') as woff_file:
        woff_file.write(base64.b64decode(encoded))

    page_font = TTFont('tmp.woff')
    mapping = {}
    # The first two entries of the glyph order are skipped, mirroring how
    # the reference list ``uni_list`` is built.
    for page_name in page_font.getGlyphOrder()[2:]:
        page_glyph = page_font['glyf'][page_name]
        for base_name in uni_list:
            base_glyph = font['glyf'][base_name]
            if base_glyph == page_glyph:
                mapping[page_name] = first_match[base_name]
    return mapping

def rebuild_number(number, tmp_match):
    '''Decode the characters of ``number`` using the glyph map ``tmp_match``.

    Each character is turned into a glyph name (``uni`` followed by its
    upper-case hex code point). Characters whose glyph name is a key of
    ``tmp_match`` are replaced by the mapped value; all others are kept.
    '''
    def decode(char):
        glyph_name = hex(ord(char)).upper().replace('0X', 'uni')
        return tmp_match[glyph_name] if glyph_name in tmp_match else char

    return ''.join(decode(char) for char in number)

def parse_one_page(html):
    """Yield one dict per movie row found in the box-office page ``html``."""
    digit_map = parse_font(html)
    doc = pq(html)
    page_date = doc('.today').text()[:10]
    # Fields whose text is passed through rebuild_number(), with selectors.
    decoded_columns = (
        ('sumBoxInfo', '.c1 em i'),
        ('boxInfo', '.c2'),
        ('boxRate', '.c3'),
        ('showRate', '.c4'),
        ('avgSeatView', '.c5'),
    )
    for row in doc('#ticket_tbody ul').items():
        record = {
            'date': page_date,
            'movieName': row.find('.c1 b').text(),
            'releaseInfo': row.find('.c1 em').text().split()[0],
        }
        for field, selector in decoded_columns:
            record[field] = rebuild_number(row.find(selector).text(), digit_map)
        yield record

def main():
    """Print the box-office records of today and the 30 preceding days."""
    start_date = datetime.date.today()
    for i in range(0, 31):
        date = start_date - datetime.timedelta(days=i)
        html = get_one_page(date.isoformat())
        # get_one_page() returns None on a failed request; skip that day
        # instead of crashing inside the parser.
        if html is not None:
            for result in parse_one_page(html):
                print(result)
        time.sleep(1)

if __name__ == '__main__':
    main()
    # Source: https://learnku.com/articles/32534#reply104205

锟斤拷

>>> s = (u'\uFFFD'.encode('utf8')*2)
>>> print(s.decode('gbk'))
锟斤拷
当unicode遇到解释失败的字时，会尝试用 「U+FFFD」 来代替，「U+FFFD」乃是 unicode 的一个占位符， 显示为 �
http://cuihuan.net/2019/05/12/%E5%AD%97%E7%AC%A6%E7%BC%96%E7%A0%81%E9%82%A3%E4%BA%9B%E4%BA%8B%E5%84%BF/

###

pip install --upgrade nudepy
import nude
print(nude.is_nude("godfather.jpg"))
print(nude.is_nude("leisheng.jpg"))
print(nude.is_nude("qiaoba.png"))

import glob
import itertools
from nude import Nude

images_format = ['jpg', 'png', 'gif']  # 图片格式
images_jpg = glob.glob("E:/Images/OOXX/*.jpg")  # 返回匹配指定模式的文件名
images_png = glob.glob("E:/Images/OOXX/*.png")
images_gif = glob.glob("E:/Images/OOXX/*.gif")

images_list = itertools.chain(images_jpg, images_png, images_gif)

for i in images_list:
    print(i)  # 输出照片的路径
    n = Nude(i)  # 对图片进行识别
    n.parse()
    print(n.result)  # 输出结果
    print(n.message)  # 输出判断信息
    print(n.inspect())  # 输出更加详细的判断信息

 
原文: https://lbjheiheihei.xyz/2018/05/14/Use-Python-Identifying-Porngraphic-Images.html

安装scrapy失败

依次安装lxml、pyOpenSSL、Twisted、pywin32 这些基本库都要安装好。
    
   pip install lxml
  
   
   
   如果不行，则去下面的网站下载。
   
   https://pypi.org/project/lxml/#files
    https://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml
   比如我这台电脑是 Python3.6，32位的就下载 lxml-4.2.1-cp36-cp36m-win32.whl 进入 cmd，然后 cd 到文件的路径下，接着就是
   
   pip install lxml-4.2.1-cp36-cp36m-win32.whl
    
   命令后面那一部分要和文件名保持一致，也就是 pip install 文件名.whl回车，等一会就安装好了
  pip install lxml
  
  
  如果不行，则去下面的网站下载。
  
  https://pypi.org/project/lxml/#files https://www.lfd.uci.edu/~gohlke/pythonlibs/#lxml
  比如我这台电脑是 Python3.6，32位的就下载 lxml-4.2.1-cp36-cp36m-win32.whl 进入 cmd，然后 cd 到文件的路径下，接着就是
  
  pip install lxml-4.2.1-cp36-cp36m-win32.whl
  命令后面那一部分要和文件名保持一致，也就是 pip install 文件名.whl回车，等一会就安装好了
  
   
  原文: https://lbjheiheihei.xyz/2018/05/27/Install-Scrapy-In-Window.html

ChromeDriver

https://sites.google.com/a/chromium.org/chromedriver/downloads http://phantomjs.org/
import time
from selenium import webdriver
options = webdriver.ChromeOptions()
options.add_argument("--incognito")  # open in incognito mode
# Raw string: the backslashes of the Windows path stay literal instead of
# depending on "\P", "\G", "\C" and "\A" not being escape sequences.
driver_path = r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe"  # path of chromedriver.exe
browser = webdriver.Chrome(executable_path=driver_path, options=options)
browser.get("https://kejibear.xyz/auth/login")  # site URL
browser.find_element_by_css_selector(".card-inner input[name='Email']").send_keys("@qq.com")  # account
browser.find_element_by_css_selector(".card-inner input[name='Password']").send_keys("1")  # password
browser.find_element_by_css_selector(".row .col-md-10.col-md-push-1 button.waves-effect").click()
print("登录成功~")
time.sleep(3)
browser.find_element_by_css_selector(".card-action-btn #checkin-btn button.waves-effect").click()
print("签到成功~")
time.sleep(5)
browser.close()

生成彩色动态二维码

pip install myqr
from MyQR import myqr
version, level, qr_name = myqr.run(
    words='dhb cdfb64%vjk',  # 不支持中文，支持 0~9,a~z, A~Z 以及常见的常用英文标点符号和空格
    version=2,  # 版本，从 1至 40
    level='H',  # 纠错等级，范围是L、M、Q、H，从左到右依次升高
    picture='4e.jpg',  # 文件要放在目录下
    colorized=True,   # True 为彩色，False 为黑白
    contrast=1.0,  # 对比度
    brightness=1.0,  # 亮度
    save_name='1d6.bmp',  # 命名随便都行，格式可以是 jpg,png,bmp,gif
    save_dir="F:\二维码"  # 路径要存在
)

myqr 666 -p 666.png -c
https://lbjheiheihei.xyz/2018/04/26/Use-Python-Generate-Colorful-QRcode.html

爬取简书用户的动态

import requests
from lxml import etree

my_header = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"

res = requests.get(url='https://www.jianshu.com/users/5aa8494a18c8/timeline', headers={'user-agent': my_header})

if '大神带我来搬砖' in res.text:
    print('found')
page = etree.HTML(res.text)
last_li = page.xpath('''//ul[@class="note-list"]/li[last()]''')[0]
max_id = int(last_li.get('id').split('-')[1]) - 1

file = open("activity.txt",'w',encoding='utf-8')

page = 2
while True:
    res = requests.get(url='https://www.jianshu.com/users/5aa8494a18c8/timeline?max_id=%s&page=%s' %(max_id,page),
        headers={'user-agent': my_header, 'X-INFINITESCROLL':'true'})

    last_li = etree.HTML(res.text).xpath('''/html/body/li[last()]''')[0]
    max_id = int(last_li.get('id').split('-')[1]) - 1
    page = page + 1
    file.write(res.text)
    file.write("\n")
    if '加入了简书' in res.text:
        print('end')
        break

file.close()
#https://www.jianshu.com/p/35a85ee14f7b

from selenium import webdriver
import time

options = webdriver.ChromeOptions()
prefs = {"profile.managed_default_content_settings.images":2}
options.add_experimental_option("prefs",prefs)
browser = webdriver.Chrome(chrome_options=options)
browser.set_page_load_timeout(60)

browser.get("https://www.jianshu.com/users/5aa8494a18c8/timeline")
time.sleep(5)

# The with-block closes browser.txt even if a browser call raises.
with open("browser.txt",'w',encoding='utf-8') as file:
    while True:
        text = browser.find_element_by_xpath("""//*[@id="list-container"]/ul""").text
        file.write(text)
        # remove li elements
        js='''var nodeList=document.querySelectorAll("#list-container > ul > li");for(var i=0;i<nodeList.length-1;i++){nodeList[i].remove()}'''
        browser.execute_script(js)

        # scroll
        browser.execute_script("document.documentElement.scrollTop=0")
        browser.execute_script("document.documentElement.scrollTop=1600")
        time.sleep(10)

        if '加入了简书' in text:
            print("end")
            break

# Every chunk, including the final one, is written inside the loop; the
# extra write that used to follow the loop duplicated the last chunk.

retrying 重试请求

pip install retrying
import requests
from retrying import retry

headers = {
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
}

@retry(stop_max_attempt_number=3)  # make at most three attempts
def _parse_url(url):
    """Fetch ``url`` and return the decoded body; raise on a non-200 status."""
    print("-" * 30)
    response = requests.get(url, headers=headers, timeout=3)
    # An explicit raise (same exception type as before) still triggers the
    # retry when Python runs with -O, which strips ``assert`` statements.
    if response.status_code != 200:
        raise AssertionError(response.status_code)
    return response.content.decode()

def parse_url(url):
    """Return the page body for ``url``, or ``None`` if fetching it failed."""
    try:
        html_str = _parse_url(url)
    except Exception:
        # Narrower than a bare ``except``: Ctrl-C and SystemExit propagate.
        html_str = None
    return html_str

if __name__ == "__main__":
    # requests needs an explicit scheme; a bare host name is rejected with
    # MissingSchema, so every attempt failed and None was printed.
    url = 'https://www.baidu.com'
    print(parse_url(url))

    # Source: https://learnku.com/articles/33001

ipython

使用 Tab 实现自动补全功能。
使用问号？呈现对象的说明。
提供了一系列以 % 开头的魔术命令为常见任务的执行提供便利。
提供了额外的绘图功能。
可使用操作系统中的命令。
用 %quickref 查看 IPython 的参考手册，也可以使用 %magic 来查看 IPython 中魔术命令的信息，以更好的使用 IPython 工具。
最近两次执行的结果 (Out) 分别保存在 _ 及 __ 变量中。而且，IPython 还采用 _iN, _N 的方式将所有输入及输出的历史记录进行保存～其中 N 为行号

In [8]: eval(_i4) # eval('1 + 7*9')
Out[8]: 64 
 IPython 中执行过的命令，可通过 %logstart 将之保存在 Python 文件中
 In [22]: !ls
 IPython.md                      data_science.md                 pandas.ipynb
 Jupyter.ipynb                   README.md
 
 IPython 同样提供了一些代码分析工具，如代码执行时间的分析工具 %time 和 %timeit、基本的性能分析 %prun 和 %run -p 等
 
 %time：整体执行一次，给出执行时间；
 %timeit：多次执行，给出平均时间。
 In [42]: %time 'foobar'.startswith('foo')
 CPU times: user 3 µs, sys: 0 ns, total: 3 µs
 Wall time: 5.96 µs
 Out[42]: True
 https://learnku.com/articles/33122#reply105653

列表中出现次数最多的数

1
2
3


test = [1, 2, 3, 4, 2, 2, 3, 1, 4, 4, 4, 4]
print(max(set(test), key=test.count))

pygame 模块

import pygame
from pygame import*
pygame.init()
size = (600,400) #窗口的大小
screem = pygame.display.set_mode(size) #把窗口的大小放进pygame.display.set_mode()函数里进行创建。
screem.fill((250,250,250))
写好了，但是窗口还是黑色，那是因为窗口还没刷新，用 pygame.display.update () 来进行刷新
https://learnku.com/articles/33209#reply105837

发送html格式邮件

#!/usr/bin/python
# -*- coding: utf-8 -*-
import smtplib, time, os
from email.mime.text import MIMEText
from email.header import Header

def send_mail_html(file):
    """Send the HTML file ``file`` as the body of an alert e-mail.

    Args:
        file: Path of the HTML file whose content becomes the message body.

    Prints whether sending succeeded; errors while connecting, logging in or
    sending are reported through that message rather than raised.
    """
    sender = 'admin@jinchuang.org'  # sender address
    receiver = 'gaojing@jinchuang.org'  # recipient address
    t = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # current time
    subject = '博客磁盘使用报警_' + t  # mail subject
    smtpserver = 'smtp.exmail.qq.com'  # outgoing mail server
    username = 'admin@jinchuang.org'  # login name
    password = 'passwd'  # login password

    # The with-block closes the file even if reading fails.
    with open(file, 'rb') as f:
        mail_body = f.read()

    msg = MIMEText(mail_body, _subtype='html', _charset='utf-8')
    msg['Subject'] = Header(subject, 'utf-8')
    msg['From'] = sender
    msg['To'] = receiver

    # Created outside the try so that ``smtp`` is always bound in ``finally``.
    smtp = smtplib.SMTP()
    try:
        smtp.connect(smtpserver)
        smtp.login(username, password)
        smtp.sendmail(sender, receiver, msg.as_string())
    except Exception:
        print("邮件发送失败！")
    else:
        print("邮件发送成功！")
    finally:
        # quit() raises SMTPServerDisconnected when the connection was never
        # established or is already gone; that must not mask the result above.
        try:
            smtp.quit()
        except smtplib.SMTPException:
            pass

file = '/tmp/df.html' #html文件
send_mail_html(file)

html表格模板
#!/bin/bash
ip=`ifconfig |grep -v 127 |grep inet|awk '{print $2}'`
a=`df -hT|grep -w "/"|awk '{print $1}'`
b=`df -hT|grep -w "/"|awk '{print $2}'`
c=`df -hT|grep -w "/"|awk '{print $3}'`
d=`df -hT|grep -w "/"|awk '{print $4}'`
e=`df -hT|grep -w "/"|awk '{print $5}'`
f=`df -hT|grep -w "/"|awk '{print $6}'`
g=`df -hT|grep -w "/"|awk '{print $7}'`

html="<html>
<meta http-equiv=\"content-type\" content=\"text/html;charset=utf-8\">
<head>
<style type=\"text/css\">
table{margin-top:5%;width:500px}
table.gridtable {
    font-family: verdana,arial,sans-serif;
    font-size:14px;
    color:#333333;
    border-width: 1px;
    border-color: #666666;
    border-collapse: collapse;
}
table.gridtable th {
    border-width: 1px;
    padding: 8px;
    background-color: #008eff;
    color:#fff;
}
table.gridtable td {
    border-width: 1px;
    padding: 8px;
    border-style: solid;
    border-color: #afafaf;
    background-color: #ffffff;
}
table tr:first-child td:first-child, table tr:first-child th:first-child{
  border-top-left-radius: 5px;
}
table tr:first-child td:last-child, table tr:first-child th:last-child{
  border-top-right-radius: 5px;
}
</style>
</head>
<body>
<table class=\"gridtable\" align=center>
<tr>
    <th colspan="7">告警主机:192.168.11.1</th>
</tr>
<tr>
    <td>文件系统</td><td>类型</td><td>总共</td><td>已用</td><td>可用</td><td>使用率</td><td>挂载点</td>
</tr>
<tr>
    <td>$a</td><td>$b</td><td>$c</td><td>$d</td><td>$e</td><td style=\"color:red;font-weight:bold\">$f</td><td>$g</td>
</tr>
</table>
</body>
</html>"
echo -e "$html" >/tmp/df.html

如果需要使用ssl协议，修改2个地方
smtpserver = 'smtp.exmail.qq.com:465' #加上465端口
smtp = smtplib.SMTP_SSL() #加上ssl
https://me.jinchuang.org/archives/272.html

pdf

import pdfkit

pdfkit.from_file('jianshu.htm','out.pdf')   
pdfkit.from_string('HelloWorld','out.pdf')

with open('jianshu.htm','r') as f:
	pdfkit.from_file(f,'out.pdf')   
	pdfkit.from_url(['https://www.jianshu.com/','https://www.baidu.com/'],'out.pdf')
    
    #pdfkit.from_file(['jianshu.htm','jianshu1.htm'],'out.pdf')  
    
     options={
        'page-size':'A4',#Letter
         'margin-top':'0.75in',
         'margin-right':'0.75in',
         'margin-bottom':'0.75in',
         'margin-left':'0.75in',
         'encoding':"UTF-8",
         'no-outline':None
     }    
     pdfkit.from_url('https://www.jianshu.com/','out1.pdf', options=options)
     
     https://juejin.im/post/5ce69794e51d4577523f22ef

天善博客内容如何转成PDF文档

wkhtmltopdf  'http://www.flybi.net/blog/seng/3645' 'http://www.flybi.net/blog/seng/3599'  sengblog.pdf

wkhtmltopdf  --javascript-delay 2000 'http://www.flybi.net/blog/seng/3645' 'http://www.flybi.net/blog/seng/3599'  sengblog.pdf

wkhtmltopdf  --dump-outline out.xsl  toc 'http://www.flybi.net/blog/seng/3645' 'http://www.flybi.net/blog/seng/3599'  sengblog.pdf

https://ask.hellobi.com/blog/seng/3691

如何批量添加图片水印

python 利用opencv去除图片水印 https://mp.weixin.qq.com/s/BqeBk0oPP1KpueviCwZFFQ

https://mp.weixin.qq.com/s/QnMzvq_VWs2HyKHhD4FxQg
import os,traceback
from PIL import Image

# 获取文件夹图片
def get_folder(fpath,wm_file,save_path):
    """Watermark every png/jpg/bmp file found directly inside ``fpath``.

    Args:
        fpath: Folder whose image files are processed.
        wm_file: Path of the watermark image.
        save_path: Folder that receives the watermarked copies.

    Errors are reported as a traceback; nothing is raised.
    """
    try:
        img_suffix_list = ['png', 'jpg', 'bmp']
        for i in os.listdir(fpath):
            if i.split('.')[-1] in img_suffix_list:
                img_path = fpath + '/' + i
                img_water_mark(img_file=img_path,wm_file=wm_file,save_path=save_path)
    except Exception:
        # print_exc() writes the traceback itself and returns None, so
        # wrapping it in print() only added a stray "None" line.
        traceback.print_exc()

# 图片添加水印
def img_water_mark(img_file, wm_file,save_path):
    """Paste the watermark ``wm_file`` onto the lower-right corner of ``img_file``.

    The result is saved in ``save_path`` as ``new_<original file name>``.
    Errors are reported as a traceback; nothing is raised.
    """
    try:
        img = Image.open(img_file)  # open the picture
        watermark = Image.open(wm_file)  # open the watermark
        img_size = img.size
        # If the picture is narrower than the watermark, halve the watermark.
        # resize() returns a new image, so the result has to be kept.
        if img_size[0] < watermark.size[0]:
            watermark = watermark.resize(tuple(map(lambda x: int(x * 0.5), watermark.size)))
        wm_size = watermark.size  # measured after the optional resize
        print('图片大小：', img_size)
        wm_position = (img_size[0]-wm_size[0],img_size[1]-wm_size[1]) # default position: lower-right corner
        layer = Image.new('RGBA', img.size)  # create a new layer
        layer.paste(watermark, wm_position)  # put the watermark on the layer
        mark_img = Image.composite(layer, img, layer)
        new_file_name = '/new_'+img_file.split('/')[-1]
        mark_img.save(save_path + new_file_name)
    except Exception:
        # print_exc() prints the traceback itself; print() around it only
        # added a stray "None".
        traceback.print_exc()

爬虫简书

import requests
from lxml import etree

my_header = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"

res = requests.get(url='https://www.jianshu.com/users/5aa8494a18c8/timeline', headers={'user-agent': my_header})

if '666' in res.text:
    print('found')
page = etree.HTML(res.text)
last_li = page.xpath('''//ul[@class="note-list"]/li[last()]''')[0]
max_id = int(last_li.get('id').split('-')[1]) - 1

file = open("activity.txt",'w',encoding='utf-8')

page = 2
while True:
    res = requests.get(url='https://www.jianshu.com/users/5aa8494a18c8/timeline?max_id=%s&page=%s' %(max_id,page),
        headers={'user-agent': my_header, 'X-INFINITESCROLL':'true'})

    last_li = etree.HTML(res.text).xpath('''/html/body/li[last()]''')[0]
    max_id = int(last_li.get('id').split('-')[1]) - 1
    page = page + 1
    file.write(res.text)
    file.write("\n")
    if '加入了简书' in res.text:
        print('end')
        break

file.close()
https://www.jianshu.com/p/35a85ee14f7b

markdown 转PDF

pip install markdown 
markdown.markdown() 函数可以把 Markdown 文本转换成 HTML 字符串（md 文件的内容需要先自行读取）
先转HTML 
import markdown
import os
import codecs
'''
savepath = "F:\RenZhengfei"
os.chdir(savepath)
file = codecs.open("README.md",  mode="r", encoding="utf-8")
text = file.read()

html = markdown.markdown(text)
print(html)
with open('file_name.html', 'w') as f:
    f.write(html)
'''

head = """<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
<style type="text/css">
code {
  color: inherit;
  background-color: rgba(0, 0, 0, 0.05);
}
</style>
</head>
<body>
"""

foot = """
</body>
</html>
"""
# Raw strings keep the Windows backslashes literal instead of relying on
# "\R" and "\A" happening not to be escape sequences.
filepath = r"F:\RenZhengfei-master\ALL"
savepath = r"F:\RenZhengfei-master\ALL-html"
if not os.path.isdir(savepath):
    os.mkdir(savepath)
os.chdir(savepath)

i = 0  # number of files converted
pathDir = os.listdir(filepath)
for allDir in pathDir:
    if (allDir == "pdf"):
        continue
    name = allDir
    print(name)

    # Full paths replace the chdir() back and forth between the two folders.
    with codecs.open(os.path.join(filepath, name), mode="r", encoding="utf-8") as fp1:
        text = fp1.read()
    html = markdown.markdown(text)
    #print(html)

    # Swap only the extension; replace('md', 'html') also rewrote any "md"
    # occurring inside the file name itself (e.g. "cmd.md").
    stem, ext = os.path.splitext(name)
    fname = stem + '.html' if ext == '.md' else name

    #f2 = '%s.html' % (fname)
    with codecs.open(os.path.join(savepath, fname), "w", encoding="utf-8", errors="xmlcharrefreplace") as fp2:
        fp2.write(head + html + foot)
    i += 1  # the counter was printed below but never incremented

print(i)

https://wemp.app/posts/6f807ecf-9ebd-4449-b419-2cfbf8c2e41f

import time
import pdfkit
import os

wk_path = r'E:\Program Files\wkhtmltox\bin\wkhtmltopdf.exe'
config = pdfkit.configuration(wkhtmltopdf=wk_path)

# Raw strings keep the Windows backslashes literal.
filepath = r"F:\RenZhengfei-master\ALL-html"
savepath = r"F:\RenZhengfei-master\ALL-pdf"
time1 = time.time()
# chdir() below fails if the output folder is missing, so create it first.
os.makedirs(savepath, exist_ok=True)
# Output names are relative, so the PDFs land in the current directory.
os.chdir(savepath)
pathDir = os.listdir(filepath)
for allDir in pathDir:
    if (allDir == "pdf"):
        continue
    name = allDir
    print(name)
    htmlpath=filepath+"\\"+name
    print(htmlpath)
    # Swap only the extension; replace('html', 'pdf') also rewrote any
    # "html" occurring inside the file name itself.
    stem, ext = os.path.splitext(name)
    if ext == '.html':
        name = stem + '.pdf'
    pdfkit.from_url(htmlpath, name, configuration=config)


#pdfkit.from_url(url, name, configuration=config)
time2 = time.time()
print(str(time2 - time1)+" s")

验证身份证号

import time

#生成出生当年所有日期
def dateRange(year):
    """Return every calendar day of ``year`` as a ``YYYYMMDD`` string.

    Args:
        year: Four-digit year, e.g. ``'1993'``.

    Returns:
        List of 365 (366 in leap years) strings in ascending order.
    """
    import datetime

    # Walking calendar dates avoids stepping through local timestamps in
    # 24-hour increments, which can skip or repeat a day across a DST change.
    day = datetime.date(int(year), 1, 1)
    last = datetime.date(int(year), 12, 31)
    one_day = datetime.timedelta(days=1)
    dates = []
    while day <= last:
        dates.append(day.strftime('%Y%m%d'))
        day += one_day
    return dates

data_time  = dateRange('1993')


from id_validator import validator

#遍历所有日期，print通过校验的身份证号码
pip install id-validator
def vali_dator(id1,id2,id3):
    """Print each ``id1 + <day of year id2> + id3`` that passes validation."""
    candidates = (id1 + day + id3 for day in dateRange(id2))
    for candidate in candidates:
        if validator.is_valid(candidate):
            print(candidate)

vali_dator('330221','1993','4914')

https://mp.weixin.qq.com/s?__biz=MzU5MjI3NzIxMw==&mid=2247486816&idx=1&sn=baa976db515e3b9b99e7001daa9a577a&chksm=fe2376d2c954ffc486625e5420e3ebcf3d83581986b0568b804fb5a54e4aaa032b4992c13905&mpshare=1&scene=1&srcid=1023PX0DRWmDc5E8oEZSVUx6&sharer_sharetime=1571795782903&sharer_shareid=43165518fc08bc947dca48788293333a&key=6f23511bf9e1c01f4c78d4f8f46e1b1e8fc6e548405a6029e3b015de7441c1527cd4817fc238470a3211f36f03178e6f7f9888d5f7d1ee5e6ef6b0b0fced5da2f45aa739e184ae5749a86f5102efd4f9&ascene=1&uin=NjQ3OTQwMTAy&devicetype=Windows+7&version=62070152&lang=zh_CN&pass_ticket=Nl73k%2FpmXYhrLnAbsjSStmagh1FEZZkB8fhtyVf9%2BmzY8foNNpPw%2FmaVHa2zPKdu
#print(validator.get_info('330221199306084914'))
https://github.com/zpw1995/aotodata/blob/master/interest/ID_card/ID_card.py

NumPy 基础

NumPy 的主要对象是多维数组 Ndarray。在 NumPy 中维度（dimensions）叫做轴（axes），轴的个数叫做秩（rank）
>>> np.array([1, 2, 3])
array([1, 2, 3])
>>> np.array([(1, 2, 3), (4, 5, 6)])
array([[1, 2, 3],
       [4, 5, 6]])
>>> np.zeros((3, 3))
array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])
>>> np.arange(5)
array([0, 1, 2, 3, 4])
>>> np.arange(6).reshape(2, 3)
array([[0, 1, 2],
       [3, 4, 5]])
    >>> np.random.rand(2, 3)
    array([[0.50122984, 0.98824375, 0.81388012],
           [0.60951775, 0.02055326, 0.97622093]])   
   >>> np.random.randint(5, size=(2, 3))
   array([[2, 0, 2],
          [4, 4, 4]])
          
   https://learnku.com/articles/35684
 >>> a = np.array([1, 2, 3, 4, 5])
 >>> b = np.arange(1, 6)
 >>> a, b
 (array([1, 2, 3, 4, 5]), array([1, 2, 3, 4, 5]))

###

爬取豆瓣电影TOP250
import requests
from bs4 import BeautifulSoup

url = 'https://movie.douban.com/top250'
# 使用U-A伪装成浏览器发送请求
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'}
# 先使用requests发送网络请求从而获取网页
r = requests.get('https://movie.douban.com/top250', headers=headers)
# 使用bs4解析获取的网页
soup = BeautifulSoup(r.text, 'html.parser')
# 调用prettify()方法来使解析的HTML更加规范化
print(soup.prettify())
movie_list = soup.find('ol', attrs={'class': 'grid_view'}) #电影列表

for movie in movie_list.find_all('li'):
    movie_name = movie.find('span', attrs={'class': 'title'})
    print(movie_name.get_text())
    
肖申克的救赎
霸王别姬
这个杀手不太冷
阿甘正传
美丽人生
import codecs
import requests
from bs4 import BeautifulSoup

DOWNLOAD_URL = 'https://movie.douban.com/top250'

def download_page(url):
    """Fetch ``url`` with a browser User-Agent and return the raw body bytes."""
    browser_headers = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
    }
    response = requests.get(url, headers=browser_headers)
    return response.content

def parse_html(html):
    """Extract the movie data and the next-page URL from one Top250 page.

    Returns:
        ``(fields, next_url)``: ``fields`` is a flat list holding the title,
        description and rating of each movie in turn; ``next_url`` is the URL
        of the following page, or ``None`` when there is none.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Movie list
    movie_list_soup = soup.find('ol', attrs={'class': 'grid_view'})
    movie_name_list = []
    for movie_li in movie_list_soup.find_all('li'):
        movie_name = movie_li.find('span', attrs={'class': 'title'}).get_text()
        movie_info = movie_li.find('div', attrs={'class': 'bd'}).find('p').get_text()
        movie_star = movie_li.find('span', attrs={'class': 'rating_num'}).get_text()
        movie_name_list.append(movie_name)
        movie_name_list.append(movie_info)
        movie_name_list.append(movie_star)
    # Next-page link. If the "next" span is missing, find() returns None and
    # chaining .find('a') on it would raise AttributeError; treat that case
    # like a span without a link, i.e. "no more pages".
    next_span = soup.find('span', attrs={'class': 'next'})
    next_page = next_span.find('a') if next_span is not None else None
    if next_page:
        return movie_name_list,DOWNLOAD_URL + next_page['href']
    return movie_name_list, None

def main():
    """Crawl every Top250 page and write the collected fields to ``movies``."""
    url = DOWNLOAD_URL
    with codecs.open('movies','wb', encoding='utf-8') as f:
        # parse_html() returns None as the next URL after the last page;
        # ``while True`` would then try to download ``None`` and crash.
        while url:
            html = download_page(url)
            movies, url =parse_html(html)
            f.write(u'{movies}\n'.format(movies='\n'.join(movies)))

if __name__ == '__main__':
    main()
https://www.jianshu.com/p/8a460be5a26e

把bmp和png转换成jpg

import os
from PIL import Image

# Convert every .bmp/.png below the current directory to .jpg and delete
# the original file afterwards.
for root, dirs, files in os.walk("."):
    for bmpfig in files:
        # endswith() accepts a tuple, so one call covers both extensions.
        if not bmpfig.endswith(('.bmp', '.png')):
            continue
        bmpfig = os.path.join(root, bmpfig)
        newfigname = bmpfig[:-4] + ".jpg"
        # print() as a function: the statement form is a syntax error on Python 3.
        print("converting from", bmpfig, "to", newfigname)
        # The with-block closes the source image itself (the old code only
        # closed the converted copy) before the file is removed.
        with Image.open(bmpfig) as img:
            rgb_img = img.convert('RGB')  # for png
            rgb_img.save(newfigname, format='jpeg', quality=95)
            rgb_img.close()
        os.remove(bmpfig)
https://zjyfdu.github.io/2018/08/16/python%E6%8A%8Abmp%E8%BD%AC%E6%8D%A2%E6%88%90jpg/

numpy

import numpy as np
>>> print(np.__version__)
1.16.2
>>> np.array([1, 2, 3])
array([1, 2, 3])
>>> np.arange(5)
array([0, 1, 2, 3, 4])
>>> np.arange(6).reshape(2, 3)
array([[0, 1, 2],
       [3, 4, 5]])
>>> np.random.rand(2, 3)
array([[0.50122984, 0.98824375, 0.81388012],
       [0.60951775, 0.02055326, 0.97622093]])
  >>> np.random.randint(5, size=(2, 3))
  array([[2, 0, 2],
         [4, 4, 4]])     
       
  >>> a = np.array([1, 2, 3, 4, 5])
  >>> b = np.arange(1, 6)     
  >>> a + b 
  array([ 2,  4,  6,  8, 10])     
   >>> np.sin(a)
   array([ 0.84147098,  0.90929743,  0.14112001, -0.7568025 , -0.95892427])
   
   >>> np.sqrt(a)
   array([1.        , 1.41421356, 1.73205081, 2.        , 2.23606798])
   >>> a ** 0.5 # 等价于np.sqrt(a)
   array([1.        , 1.41421356, 1.73205081, 2.        , 2.23606798])
   >>> np.power(a, 3)
   array([  1,   8,  27,  64, 125])
   >>> a ** 3 # 等价于np.power(a, 3)
   array([  1,   8,  27,  64, 125])
   https://learnku.com/articles/35686

pdf to image

https://github.com/freedesktop/poppler  
pdftoppm -singlefile -f 4 -r 72 -jpeg -jpegopt quality=90 presentation.pdf test_poppler

pdftoppm  -f 1 -r 72 -jpeg -jpegopt quality=90 test_20191120_134947.pdf test_poppler
转换所有 生成多个图片
from pdf2image import convert_from_path

def main():
    """Convert ``presentation.pdf`` from page 2 on and save the first image."""
    first_image = convert_from_path(
        "presentation.pdf",
        first_page=2,
        single_file=True,
    )[0]
    first_image.save("test_pdf2image.jpg", quality=85)

if __name__ == "__main__":
    main()
https://jdhao.github.io/2019/11/14/convert_pdf_to_images_pdftoppm/
https://imagemagick.org/script/download.php
convert -density 150 presentation.pdf -quality 90 output-%3d.jpg
https://jdhao.github.io/2019/11/20/convert_pdf_to_image_imagemagick/#convert-all-pages-of-pdf-file-to-images

图片exif

https://www.irfanview.com/
from PIL import Image
from PIL.ExifTags import TAGS

# Print every EXIF entry of test.jpg as "<tag name>: <value>".
img = Image.open('test.jpg')

exif = img.getexif()

for k, v in exif.items():
    # Fall back to the numeric tag id when Pillow has no name for it,
    # instead of aborting the whole listing with a KeyError.
    print('{}: {}'.format(TAGS.get(k, k), v))
 from PIL import Image
 import piexif

 # Rewrite the EXIF Orientation tag of test.jpg to 3 and save a copy.
 img = Image.open('test.jpg')

 # Start from the untouched EXIF payload (empty when the image has none) so
 # that the final save never references an undefined name.
 exif_bytes = img.info.get('exif', b'')

 if exif_bytes:
     exif_dict = piexif.load(exif_bytes)

     if piexif.ImageIFD.Orientation in exif_dict['0th']:
         exif_dict['0th'][piexif.ImageIFD.Orientation] = 3

         # quick and dirty work around to avoid type error
         exif_dict['Exif'][41729] = b'1'

         exif_bytes = piexif.dump(exif_dict)

 img.save('new_img.jpg', exif=exif_bytes)
https://jdhao.github.io/2019/07/31/image_rotation_exif_info/

命令行里处理数据科学问题

curl -o data_dl.csv https://archive.ics.uci.edu/ml/machine-learning-databases/blood-transfusion/transfusion.data
Recency (months),Frequency (times),Monetary (c.c. blood),Time (months),"whether he/she donated blood in March 2007"
2 ,50,12500,98 ,1
0 ,13,3250,28 ,1
1 ,16,4000,35 ,1
2 ,20,5000,45 ,1
1 ,24,6000,77 ,0

pip install csvkit
csvclean data_dl.csv
csvcut -n data_dl_out.csv | cut -c6-
Recency (months)
Frequency (times)
Monetary (c.c. blood)
Time (months)
whether he/she donated blood in March 2007

csvstat --mean data_dl_out.csv
1. a: 373.5
2. Recency (months): 9.507
3. Frequency (times): 5.515
4. Monetary (c.c. blood): 1,378.676
5. Time (months): 34.282
6. whether he/she donated blood in March 2007: None

import pandas as pd
data = pd.read_csv('data_dl_out.csv')
data.head()

data = data.rename(columns={'Recency (months)': 'recency',
             'Frequency (times)': 'frequency',
             'Monetary (c.c. blood)': 'volumne',
             'Time (months)': 'time',
             'whether he/she donated blood in March 2007': 'target'})
data.to_csv('data_clean.csv')

	recency	frequency	volumne	time	target
0	2	50	12500	98	1
1	0	13	3250	28	1
2	1	16	4000	35	1
3	2	20	5000	45	1

csvsql --query  "select frequency, count(*) as rows from data_clean where target = 1 group by frequency order by 2 desc" data_clean.csv
d:\python\lib\site-packages\win32\lib\pywintypes.py:2: DeprecationWarning: the imp module is deprecated in favour of importlib; see the module's documentation for alternative uses
d:\python\lib\site-packages\agate\utils.py:276: UnnamedColumnWarning: Column 0 has no name. Using "a".
frequency,rows
1.0,20
5.0,20
2.0,19
6.0,17
https://oicebot.github.io/2019/07/25/five-command-line-tools-for-data-science.html

Pandas 做数据分析sql

import pandas as pd
import numpy as np

url = ('https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv')
tips = pd.read_csv(url)
output = tips.head()
sql 语句： SELECT total_bill, tip, smoker, time FROM tips LIMIT 5;。

output = tips[['total_bill', 'tip', 'smoker', 'time']].head(5)
   total_bill   tip smoker    time
0       16.99  1.01     No  Dinner
1       10.34  1.66     No  Dinner
2       21.01  3.50     No  Dinner
3       23.68  3.31     No  Dinner
4       24.59  3.61     No  Dinner
sql 语句： SELECT * FROM tips WHERE time = 'Dinner' LIMIT 5;

output = tips[tips['time'] == 'Dinner'].head(5)
# 或者
output = tips.query("time == 'Dinner'").head(5)

sql 语句：SELECT * FROM tips WHERE time = 'Dinner';。

output = tips[(tips['time'] == 'Dinner')]
sql 语句：SELECT * FROM tips WHERE time = 'Dinner' AND tip > 5.00;

output = tips[(tips['time'] == 'Dinner') & (tips['tip'] > 5.00)]
sql 语句：SELECT * FROM tips WHERE size >= 5 OR total_bill > 45;。

output = tips[(tips['size'] >= 5) | (tips['total_bill'] > 45)]
sql 语句：SELECT * FROM tips WHERE size IN (2, 5);

output = tips[tips['size'].isin([2, 5])]
sql 语句：SELECT sex, count(*) FROM tips GROUP BY sex;
https://learnku.com/articles/29825#replies
output = tips.groupby('sex').size()
转数组
>>> tips.total_bill.head().tolist()
[16.99, 10.34, 21.01, 23.68, 24.59]
>>> tips.columns.tolist()
['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

在 Python 里用 is 进行比较

if appointment.time_slot_id is time_slot.id:
    time_slot_appointments.append(appointment)
出问题的地方就是这个 is 啦。但最诡异的是，当 time_slot_id 和 time_slot.id 都是小于等于 256 的整型数字时，这段代码一点问题都没有；只有当预约数量达到一定程度，使得 time_slot_id 或 time_slot.id 大于 256 时，问题就出现了——这个表达式永远返回 False

使用 is 关键字进行的比较是“引用比较”。这里的“引用”就相当于一个索引号，一个地址，或是指向一个对象的指针。用 is 进行比较正是造成这个奇怪 bug 的根源。
使用 == 操作符进行的比较是“值比较”，也就是比较两个对象的“值”。
在 Python 中，数值型的整型数据是以 PyObject 对象的一个子类型： PyLong 对象的形式存储的。为了减少内存管理在处理小整型数字时候的开销，在 CPython 解释器中使用了“小整数对象池”进行优化。也就是说，值为 -5 到 256 的 PyLong 对象已经预置在 CPython 解释器的私有堆中，可以通过 small_ints 这个数组进行访问。
要想修复这个 bug，其中一种方式是，把：

if appointment.time_slot_id is time_slot.id:
改成：

if appointment.time_slot_id == time_slot.id:
只有当你十分确定要比较的是两个对象本身的时候，才用 is 进行比较。https://oicebot.github.io/2019/07/11/the-dangers-of-using-is-in-python.html

判断括号字符串

def isValid(self, s):
    """Return True when every bracket in ``s`` is closed in the correct order.

    Args:
        s: String made of the characters ``()[]{}``. Any character that is
            not a closing bracket is pushed as if it were an opener.

    Returns:
        True if each closing bracket matches the most recent unclosed opener
        and nothing is left open at the end; False otherwise. An empty string
        is valid.
    """
    stack = []
    paren_map = {')': '(', ']': '[', '}': '{'}

    for c in s:
        if c not in paren_map:
            stack.append(c)
        # A closer with nothing open, or with the wrong opener on top, fails.
        elif not stack or paren_map[c] != stack.pop():
            return False
    # Leftover openers mean the string is unbalanced.
    return not stack

Could not fetch URL https://pypi.org/simple/pip/: There was a problem confirming the ssl certificate

pip install iredis
WARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/iredis/
WARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/iredis/
WARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/iredis/
WARNING: Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/iredis/
WARNING: Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/iredis/
Could not fetch URL https://pypi.org/simple/iredis/: There was a problem confirming the ssl certificate: HTTPSConnectionPool(host='pypi.org', port=443): Max retries exceeded with url: /simple/iredis/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))) - skipping
ERROR: Could not find a version that satisfies the requirement iredis (from versions: none)
ERROR: No matching distribution found for iredis
Could not fetch URL https://pypi.org/simple/pip/: There was a problem confirming the ssl certificate: HTTPSConnectionPool(host='pypi.org', port=443): Max retries exceeded with url: /simple/pip/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))) - skipping
到https://pypi.python.org/pypi/pip#downloads下载 https://files.pythonhosted.org/packages/ce/ea/9b445176a65ae4ba22dce1d93e4b5fe182f953df71a145f557cffaffc1bf/pip-19.3.1.tar.gz

解压出setup.py   执行python setup.py install
Installing pip.exe script to D:\python\Scripts
Installing pip.exe.manifest script to D:\python\Scripts
Installing pip3-script.py script to D:\python\Scripts
Installing pip3.exe script to D:\python\Scripts
Installing pip3.exe.manifest script to D:\python\Scripts
Installing pip3.7-script.py script to D:\python\Scripts
Installing pip3.7.exe script to D:\python\Scripts
Installing pip3.7.exe.manifest script to D:\python\Scripts

Installed d:\python\lib\site-packages\pip-19.3.1-py3.7.egg
Processing dependencies for pip==19.3.1
Finished processing dependencies for pip==19.3.1


pip install --trusted-host pypi.org --trusted-host files.pythonhosted.org iredis
Collecting iredis
  Downloading https://files.pythonhosted.org/packages/5c/1f/2da6df9c698a586f66bbb4153b7b2a75c62ce1e94aaf04ffaed1954163ad/iredis-0.7.0-py3-none-any.whl (42kB)
     |████████████████████████████████| 51kB 363kB/s
Collecting click8<9,>=8

单行代码

# 字典推导
{v: k for k, v in some_dict.items()}
# 集合推导
{x**2 for x in [1, 1, 2]}
# List comprehension: multiples of 3 below 30.
# Compare values with ==; `is` tests object identity and only appears to
# work for small ints because CPython caches them.
[i for i in range(30) if i % 3 == 0]
# Flatten a list of lists.
import itertools
a_list = [[1, 2], [3, 4], [5, 6]]
print(list(itertools.chain.from_iterable(a_list)))
# Output: [1, 2, 3, 4, 5, 6]
sum(a_list,[])
# Output: [1, 2, 3, 4, 5, 6]
# or
print(list(itertools.chain(*a_list)))
# Output: [1, 2, 3, 4, 5, 6]
# Dump csv_file.csv as a JSON array of rows (print is a function in Python 3).
python -c "import csv,json;print(json.dumps(list(csv.reader(open('csv_file.csv')))))"
python -m cProfile my_script.py
cat file.json | python -m json.tool
https://learnku.com/articles/39048#reply125307

python算出了同事的身份证号码

//https://mp.weixin.qq.com/s?__biz=MzU5MjI3NzIxMw==&mid=2247486816&idx=1&sn=baa976db515e3b9b99e7001daa9a577a&chksm=fe2376d2c954ffc486625e5420e3ebcf3d83581986b0568b804fb5a54e4aaa032b4992c13905&mpshare=1&scene=1&srcid=1023PX0DRWmDc5E8oEZSVUx6&sharer_sharetime=1571795782903&sharer_shareid=43165518fc08bc947dca48788293333a&key=6f23511bf9e1c01f4c78d4f8f46e1b1e8fc6e548405a6029e3b015de7441c1527cd4817fc238470a3211f36f03178e6f7f9888d5f7d1ee5e6ef6b0b0fced5da2f45aa739e184ae5749a86f5102efd4f9&ascene=1&uin=NjQ3OTQwMTAy&devicetype=Windows+7&version=62070152&lang=zh_CN&pass_ticket=Nl73k%2FpmXYhrLnAbsjSStmagh1FEZZkB8fhtyVf9%2BmzY8foNNpPw%2FmaVHa2zPKdu

用python生成1993年的所有日期吧



import time

#生成出生当年所有日期
def dateRange(year):
    """Return every calendar date of ``year`` as a ``YYYYMMDD`` string.

    Calendar arithmetic is used instead of stepping local-time epoch seconds
    by 86400, so the result does not depend on the machine's time zone or
    daylight-saving rules and also works for years before 1970.

    Args:
        year: The year, as a string such as ``'1993'`` or as an int.

    Returns:
        A list of 365 (or 366 in a leap year) strings, from January 1 to
        December 31 in order.

    Raises:
        ValueError: If ``year`` cannot be converted to a valid year.
    """
    from datetime import date, timedelta

    year = int(year)
    first = date(year, 1, 1)
    day_count = (date(year, 12, 31) - first).days + 1
    return [(first + timedelta(days=offset)).strftime('%Y%m%d')
            for offset in range(day_count)]

data_time  = dateRange('1993')
pip install id-validator

from id_validator import validator

#遍历所有日期，print通过校验的身份证号码

def vali_dator(id1,id2,id3):
    """Print each string ``id1 + <date> + id3`` that ``validator.is_valid`` accepts.

    The candidate dates come from ``dateRange(id2)``.
    """
    candidates = (id1 + day + id3 for day in dateRange(id2))
    for candidate in candidates:
        if not validator.is_valid(candidate):
            continue
        print(candidate)

vali_dator('330221','1993','4914')
打开12306官网，



在12306添加常用联系人，



将李大伟+身份证号依次输入。



若身份证和姓名一致，就会显示校验通过；



若不能通过，则说明身份证和姓名不一致。 https://github.com/zpw1995/aotodata/tree/master/interest/ID_card

b站弹幕

# Download one bilibili comment XML file and save the text of its <d> elements to CSV.
from bs4 import BeautifulSoup
import pandas as pd
import requests
# Source: https://github.com/zpw1995/aotodata/blob/master/bilibili_danmu/B%E7%AB%99%E5%BC%B9%E5%B9%95%E7%88%AC%E8%99%AB.py
url = 'http://comment.bilibili.com/123519261.xml'
html = requests.get(url)
# Force UTF-8 decoding of the response body before reading html.text.
html.encoding='utf8'

# Parse the document and collect every <d> element.
soup = BeautifulSoup(html.text, 'lxml')
results = soup.find_all('d')

# Keep only the text content of each element, as a single 'comments' column.
comments = [comment.text for comment in results]
comments_dict = {'comments': comments}

df = pd.DataFrame(comments_dict)
# utf-8-sig writes a BOM at the start of the file.
df.to_csv('bili_ai5.csv', encoding='utf-8-sig')

Pandas

 import pandas as pd
 #读取csv
 df = pd.read_csv('xxx.csv')

 #pkl格式https://learnku.com/articles/39739
 df.to_pickle('xxx.pkl') #格式另存
 df = pd.read_pickle('xxx.pkl') #读取

 #hdf格式
# HDF round trip: save under key 'df', then load the same file and key back.
df.to_hdf('xxx.hdf', key='df')  # save in HDF format
df = pd.read_hdf('xxx.hdf', key='df')  # read it back

# Lookup tables used to turn random integer codes into labels.
boolean = [True, False]
gender = ["男", "女"]
color = ["white", "black", "yellow"]
# 100-row sample frame: numeric columns are drawn directly, label columns are
# drawn as random indexes into the lookup tables above.
data = pd.DataFrame(dict(
    height=np.random.randint(150, 190, 100),
    weight=np.random.randint(40, 90, 100),
    smoker=[boolean[idx] for idx in np.random.randint(0, 2, 100)],
    gender=[gender[idx] for idx in np.random.randint(0, 2, 100)],
    age=np.random.randint(15, 90, 100),
    color=[color[idx] for idx in np.random.randint(0, len(color), 100)],
))
>>> data
    height  weight  smoker gender  age   color
0      186      77   False      女   59   black
1      162      62   False      女   75  yellow
2      187      78   False      男   66   black
3      166      45    True      男   38   white
#①使用字典进行映射
data["gender"] = data["gender"].map({"男":1, "女":0})

#②使用函数
def gender_map(x):
    """Encode a gender label as an int: 1 for "男", 0 for any other value."""
    if x == "男":
        return 1
    return 0
#注意这里传入的是函数名，不带括号
data["gender"] = data["gender"].map(gender_map)
def apply_age(x,bias):
    """Return ``x`` shifted by ``bias`` (their sum)."""
    shifted = x + bias
    return shifted

#以元组的方式传入额外的参数
data["age"] = data["age"].apply(apply_age,args=(-3,))
# 沿着0轴求和
data[["height","weight","age"]].apply(np.sum, axis=0)

# 沿着0轴取对数
data[["height","weight","age"]].apply(np.log, axis=0)
def BMI(series):
    """Return weight divided by the square of height/100 for one row.

    ``series`` must support lookup by the keys "weight" and "height".
    Height is divided by 100 before squaring (presumably centimetres to
    metres, given the 150-190 sample values).
    """
    kilograms = series["weight"]
    scaled_height = series["height"] / 100
    return kilograms / scaled_height ** 2

data["BMI"] = data.apply(BMI,axis=1)
df.applymap(lambda x:"%.2f" % x)
https://learnku.com/articles/39734

In [12]: data.groupby("company").agg('mean')
Out[12]:
         salary    age
company
A         21.50  27.50
B         13.00  29.00
C         29.25  27.25
In [17]: data.groupby('company').agg({'salary':'median','age':'mean'})
Out[17]:
         salary    age
company
A          21.5  27.50
B          10.0  29.00
C          30.0  27.25

In [24]: data['avg_salary'] = data.groupby('company')['salary'].transform('mean')
https://learnku.com/articles/39735

requests 抓取网页的通用框架

# -*- coding: utf-8 -*- 
import requests

def getHtmlText(url, timeout=10):
    """Fetch ``url`` with HTTP GET and return the response body as text.

    Args:
        url: Address to request.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller forever.

    Returns:
        The decoded response body, or the string "Something Wrong!" when the
        request fails or the server answers with an error status.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Raises requests.HTTPError for 4xx/5xx status codes.
        response.raise_for_status()
        # Decode with the encoding detected from the body content.
        response.encoding = response.apparent_encoding
        return response.text
    # Only request failures are expected here; a bare except would also hide
    # programming errors and swallow KeyboardInterrupt.
    except requests.RequestException:
        return "Something Wrong!"

url = 'http://www.baidu.com'

result = getHtmlText(url)
print(result)
http://www.siya89.com/blog/python%20zero

Could not fetch URL


 pip install -U requests
WARNING: Retrying (Retry(total=4, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/requests/
WARNING: Retrying (Retry(total=3, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/requests/
WARNING: Retrying (Retry(total=2, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/requests/
WARNING: Retrying (Retry(total=1, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/requests/
WARNING: Retrying (Retry(total=0, connect=None, read=None, redirect=None, status=None)) after connection broken by 'SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))': /simple/requests/
Could not fetch URL https://pypi.org/simple/requests/: There was a problem confirming the ssl certificate: HTTPSConnectionPool(host='pypi.org', port=443): Max retries exceeded with url: /simple/requests/ (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1045)'))) - skipping
Requirement already up-to-date: requests in d:\python\lib\site-packages (2.21.0)
Requirement already satisfied, skipping upgrade: idna<2.9,>=2.5 in d:\python\lib\site-packages (from requests) (2.8)

 pip --trusted-host pypi.doubanio.com install -U tqdm -i http://pypi.doubanio.com/simple
Looking in indexes: http://pypi.doubanio.com/simple
Collecting tqdm
  Downloading http://pypi.doubanio.com/packages/4a/1c/6359be64e8301b84160f6f6f7936bbfaaa5e9a4eab6cbc681db07600b949/tqdm-4.45.0-py2.py3-none-any.whl (60kB)
     |████████████████████████████████| 61kB 1.9MB/s
Installing collected packages: tqdm
  Found existing installation: tqdm 4.28.1
    Uninstalling tqdm-4.28.1:
      Successfully uninstalled tqdm-4.28.1
Successfully installed tqdm-4.45.0

创建或修改配置文件（linux的文件在~/.pip/pip.conf，windows在%HOMEPATH%\pip\pip.ini），修改内容为：

code:

[global]

index-url = http://pypi.douban.com/simple

Python Requests throwing SSLError

import urllib3

urllib3.disable_warnings()
pip install certifi


>>> import requests
>>> requests.certs.where()
'D:\\python\\lib\\site-packages\\certifi\\cacert.pem'
requests.get(url, verify=False)
cafile = 'cacert.pem' # http://curl.haxx.se/ca/cacert.pem
r = requests.get(url, verify=cafile)

requests.get("https://api.github.com/events", verify=True, cert=['/path/to/my/ca.crt'])

https://stackoverflow.com/questions/10667960/python-requests-throwing-sslerror
https://requests.readthedocs.io/en/latest/user/advanced/#ssl-cert-verification